def pyarrow_datatype_from_dict(json_dict: Dict) -> pyarrow.DataType:
    """
    Create a DataType in PyArrow format from a Schema json format.

    :param json_dict: the DataType in json format
    :return: the DataType in PyArrow format
    """
    # NOTE: removed a stray empty-string expression statement ("") that
    # followed the docstring in the original — it was dead code.
    type_class = json_dict["type"]["name"]
    if type_class == "dictionary":
        # NOTE(review): "dictionary" entries are materialized as a map type
        # (key = indexType, value = first child) — presumably this matches
        # how the producer serializes map columns; confirm against the writer.
        key_type = pyarrow_datatype_from_dict(json_dict["dictionary"]["indexType"])
        value_type = pyarrow_datatype_from_dict(json_dict["children"][0])
        return pyarrow.map_(key_type, value_type)
    elif type_class == "list":
        # A list's element type is carried in its single child field.
        element_type = pyarrow_datatype_from_dict(json_dict["children"][0])
        return pyarrow.list_(element_type)
    elif type_class == "struct":
        return pyarrow.struct(
            [pyarrow_field_from_dict(field) for field in json_dict["children"]]
        )
    elif type_class in ("int", "float", "date"):
        # e.g. "int" + 32 -> "int32", a valid pyarrow type alias.
        return pyarrow.type_for_alias(
            f'{type_class}{json_dict["type"]["bitWidth"]}')
    elif type_class == "time":
        type_info = json_dict["type"]
        # Map JSON unit names onto pyarrow alias suffixes; anything
        # unrecognized falls back to seconds (original behavior).
        unit = {"MICROSECOND": "us",
                "NANOSECOND": "ns",
                "MILLISECOND": "ms"}.get(type_info["unit"], "s")
        return pyarrow.type_for_alias(
            f'{type_class}{type_info["bitWidth"]}[{unit}]')
    elif type_class == "timestamp":
        type_info = json_dict["type"]
        # Default to nanoseconds when no unit is present or it is unknown.
        unit = "ns"
        if "unit" in type_info:
            unit = {"MICROSECOND": "us",
                    "NANOSECOND": "ns",
                    "MILLISECOND": "ms",
                    "SECOND": "s"}.get(type_info["unit"], "ns")
        return pyarrow.type_for_alias(f"{type_class}[{unit}]")
    elif type_class.startswith("decimal"):
        type_info = json_dict["type"]
        return pyarrow.decimal128(precision=type_info["precision"],
                                  scale=type_info["scale"])
    else:
        # Plain types ("string", "binary", ...) resolve directly by alias.
        return pyarrow.type_for_alias(type_class)
def __arrow_ext_deserialize__(cls, storage_type, serialized):
    """Rebuild an ArrowTensorType instance from its serialized metadata.

    The metadata is a JSON document carrying the tensor shape and the
    alias of the value subtype.
    """
    meta = json.loads(serialized.decode())
    return ArrowTensorType(
        shape=meta["shape"],
        subtype=pa.type_for_alias(meta["subtype"]),
    )
def __init__(self, element_shape, pyarrow_dtype):
    """Register a "dask-ms.tensor_type" extension backed by list storage.

    ``pyarrow_dtype`` may be a DataType or any string with a valid
    pyarrow alias (e.g. "float64").
    """
    if not isinstance(pyarrow_dtype, pa.DataType):
        pyarrow_dtype = pa.type_for_alias(str(pyarrow_dtype))
    self._element_shape = tuple(element_shape)
    storage = pa.list_(pyarrow_dtype)
    pa.ExtensionType.__init__(self, storage, "dask-ms.tensor_type")
def construct_from_string(cls, string: str) -> ArrowDtype:
    """
    Construct this type from a string.

    Parameters
    ----------
    string : str
        string should follow the format f"{pyarrow_type}[pyarrow]"
        e.g. int64[pyarrow]
    """
    # Guard clauses: reject non-strings and strings without the suffix.
    if not isinstance(string, str):
        raise TypeError(
            f"'construct_from_string' expects a string, got {type(string)}"
        )
    if not string.endswith("[pyarrow]"):
        raise TypeError(f"'{string}' must end with '[pyarrow]'")
    # Everything before the first "[pyarrow]" is the pyarrow alias.
    base_type, _, _ = string.partition("[pyarrow]")
    try:
        resolved = pa.type_for_alias(base_type)
    except ValueError as err:
        # Parameterized aliases like "timestamp[s]" are not supported
        # through the string constructor — tell the caller how to proceed.
        has_parameters = re.search(r"\[.*\]", base_type)
        if has_parameters:
            raise NotImplementedError(
                "Passing pyarrow type specific parameters "
                f"({has_parameters.group()}) in the string is not supported. "
                "Please construct an ArrowDtype object with a pyarrow_dtype "
                "instance with specific parameters."
            ) from err
        raise TypeError(
            f"'{base_type}' is not a valid pyarrow data type."
        ) from err
    return cls(resolved)
def __init__(self, subtype):
    """Register a "dask-ms.complex" extension type.

    Storage is a fixed-size-2 list of ``subtype`` — presumably the
    (real, imag) pair of a complex value; confirm against the caller.
    """
    if not isinstance(subtype, pa.DataType):
        # Resolve alias strings such as "float32" to a DataType.
        subtype = pa.type_for_alias(str(subtype))
    self._subtype = subtype
    storage = pa.list_(subtype, 2)
    pa.ExtensionType.__init__(self, storage, "dask-ms.complex")
def construct_from_string(cls, string):
    """Attempt to construct this type from a string.

    Parameters
    ----------
    string : str

    Returns
    -------
    self : instance of 'cls'

    Raises
    ------
    TypeError
        If a class cannot be constructed from this 'string'.

    Examples
    --------
    If the extension dtype can be constructed without any arguments,
    the following may be an adequate implementation.

    >>> @classmethod
    ... def construct_from_string(cls, string)
    ...     if string == cls.name:
    ...         return cls()
    ...     else:
    ...         raise TypeError("Cannot construct a '{}' from "
    ...                         "'{}'".format(cls, string))
    """
    # The pandas extension API requires a TypeError for anything that is
    # not a parsable string (not the ValueError/KeyError pyarrow raises).
    if not isinstance(string, str):
        raise TypeError(
            f"'construct_from_string' expects a string, got {type(string)}")
    # Remove fletcher specific naming from the arrow type string.
    if string.startswith("fletcher["):
        string = string[9:-1]
    if string == "list<item: string>":
        return cls(pa.list_(pa.string()))
    try:
        pa_type = pa.type_for_alias(string)
    except (ValueError, KeyError) as err:
        raise TypeError(
            f"Cannot construct a '{cls.__name__}' from '{string}'") from err
    return cls(pa_type)
def __init__(self, subtype, inclusive: IntervalInclusiveType) -> None:
    """Register the "pandas.interval" extension type.

    Storage is a struct of (left, right) bounds of ``subtype``.
    """
    # attributes need to be set first before calling
    # super init (as that calls serialize)
    assert inclusive in VALID_INCLUSIVE
    self._inclusive: IntervalInclusiveType = inclusive
    if not isinstance(subtype, pyarrow.DataType):
        # Accept alias strings like "int64" as well as DataType objects.
        subtype = pyarrow.type_for_alias(str(subtype))
    self._subtype = subtype
    bounds = [("left", subtype), ("right", subtype)]
    pyarrow.ExtensionType.__init__(
        self, pyarrow.struct(bounds), "pandas.interval")
def __init__(self, subtype, closed) -> None:
    """Register the "pandas.interval" extension type.

    Storage is a struct of (left, right) bounds of ``subtype``.
    """
    # attributes need to be set first before calling
    # super init (as that calls serialize)
    assert closed in VALID_CLOSED
    self._closed = closed
    if not isinstance(subtype, pyarrow.DataType):
        # Accept alias strings like "int64" as well as DataType objects.
        subtype = pyarrow.type_for_alias(str(subtype))
    self._subtype = subtype
    bounds = [("left", subtype), ("right", subtype)]
    pyarrow.ExtensionType.__init__(
        self, pyarrow.struct(bounds), "pandas.interval")
def __init__(self, shape, subtype):
    """Register the "tensorpandas.tensor" extension type.

    Storage is fixed-width binary sized for ``prod(shape)`` values of
    ``subtype`` (so ``subtype`` must be a fixed-bit-width type).
    """
    # attributes need to be set first before calling
    # super init (as that calls serialize)
    if not isinstance(subtype, pa.DataType):
        subtype = pa.type_for_alias(str(subtype))
    self._shape = shape
    # BUGFIX: assign the *resolved* DataType. The original stored the raw
    # argument before conversion, so _subtype could remain a plain string
    # while the storage type was built from the converted value.
    self._subtype = subtype
    # BUGFIX: initializer 1 lets an empty shape (0-d / scalar tensor)
    # yield size 1 instead of reduce() raising TypeError.
    size = functools.reduce(operator.mul, shape, 1)
    self._storage_type = pa.binary(size * subtype.bit_width // 8)
    pa.ExtensionType.__init__(self, self._storage_type, "tensorpandas.tensor")
def construct_from_string(cls, string):
    """Attempt to construct this type from a string.

    Parameters
    ----------
    string : str

    Returns
    -------
    self : instance of 'cls'

    Raises
    ------
    TypeError
        If a class cannot be constructed from this 'string'.

    Examples
    --------
    If the extension dtype can be constructed without any arguments,
    the following may be an adequate implementation.

    >>> @classmethod
    ... def construct_from_string(cls, string)
    ...     if string == cls.name:
    ...         return cls()
    ...     else:
    ...         raise TypeError("Cannot construct a '{}' from "
    ...                         "'{}'".format(cls, string))
    """
    if not isinstance(string, str):
        # BUGFIX: report the actual offending type. The original message
        # hardcoded "<class 'int'>" regardless of what was passed.
        raise TypeError(
            f"'construct_from_string' expects a string, got {type(string)}")
    # Remove fletcher specific naming from the arrow type string.
    if string.startswith("fletcher["):
        string = string[9:-1]
    if string == "list<item: string>":
        return cls(pa.list_(pa.string()))
    try:
        type_for_alias = pa.type_for_alias(string)
    except (ValueError, KeyError):
        # pandas API expects a TypeError
        msg = f"Cannot construct a '{cls.__name__}' from '{string}'"
        raise TypeError(msg)
    return cls(type_for_alias)
def __init__(
    self,
    subtype,
    inclusive: str | None = None,
    closed: None | lib.NoDefault = lib.no_default,
) -> None:
    """Register the "pandas.interval" extension type.

    Transitional constructor: ``closed`` is the deprecated spelling of
    ``inclusive`` and is reconciled by ``_warning_interval`` (which
    presumably emits the deprecation warning — confirm in its definition).
    """
    # attributes need to be set first before calling
    # super init (as that calls serialize)
    inclusive, closed = _warning_interval(inclusive, closed)
    assert inclusive in VALID_CLOSED
    # NOTE(review): the resolved value is stored under the old attribute
    # name _closed — downstream code still reads that name.
    self._closed = inclusive
    if not isinstance(subtype, pyarrow.DataType):
        # Accept alias strings like "int64" as well as DataType objects.
        subtype = pyarrow.type_for_alias(str(subtype))
    self._subtype = subtype
    # Storage is a struct of the interval's (left, right) bounds.
    storage_type = pyarrow.struct([("left", subtype), ("right", subtype)])
    pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval")
def test_type_for_alias():
    """Every supported alias string resolves to the matching DataType."""
    expected = {
        'i1': pa.int8(), 'int8': pa.int8(),
        'i2': pa.int16(), 'int16': pa.int16(),
        'i4': pa.int32(), 'int32': pa.int32(),
        'i8': pa.int64(), 'int64': pa.int64(),
        'u1': pa.uint8(), 'uint8': pa.uint8(),
        'u2': pa.uint16(), 'uint16': pa.uint16(),
        'u4': pa.uint32(), 'uint32': pa.uint32(),
        'u8': pa.uint64(), 'uint64': pa.uint64(),
        'f4': pa.float32(), 'float32': pa.float32(),
        'f8': pa.float64(), 'float64': pa.float64(),
        'date32': pa.date32(), 'date64': pa.date64(),
        'string': pa.string(), 'str': pa.string(),
        'binary': pa.binary(),
        'time32[s]': pa.time32('s'), 'time32[ms]': pa.time32('ms'),
        'time64[us]': pa.time64('us'), 'time64[ns]': pa.time64('ns'),
        'timestamp[s]': pa.timestamp('s'),
        'timestamp[ms]': pa.timestamp('ms'),
        'timestamp[us]': pa.timestamp('us'),
        'timestamp[ns]': pa.timestamp('ns'),
        'duration[s]': pa.duration('s'),
        'duration[ms]': pa.duration('ms'),
        'duration[us]': pa.duration('us'),
        'duration[ns]': pa.duration('ns'),
        'month_day_nano_interval': pa.month_day_nano_interval(),
    }
    for alias, dtype in expected.items():
        assert pa.type_for_alias(alias) == dtype
def test_type_for_alias():
    """Every supported alias string resolves to the matching DataType."""
    alias_map = {}
    # Short ("i4") and long ("int32") spellings of the fixed-width numerics.
    for width in (8, 16, 32, 64):
        alias_map[f'i{width // 8}'] = alias_map[f'int{width}'] = \
            getattr(pa, f'int{width}')()
        alias_map[f'u{width // 8}'] = alias_map[f'uint{width}'] = \
            getattr(pa, f'uint{width}')()
    for width in (32, 64):
        alias_map[f'f{width // 8}'] = alias_map[f'float{width}'] = \
            getattr(pa, f'float{width}')()
        alias_map[f'date{width}'] = getattr(pa, f'date{width}')()
    alias_map['string'] = alias_map['str'] = pa.string()
    alias_map['binary'] = pa.binary()
    # Temporal types are parameterized by unit.
    for unit in ('s', 'ms'):
        alias_map[f'time32[{unit}]'] = pa.time32(unit)
    for unit in ('us', 'ns'):
        alias_map[f'time64[{unit}]'] = pa.time64(unit)
    for unit in ('s', 'ms', 'us', 'ns'):
        alias_map[f'timestamp[{unit}]'] = pa.timestamp(unit)
    for alias, dtype in alias_map.items():
        assert pa.type_for_alias(alias) == dtype
def infer_schema(chunk):
    """Build a pyarrow Schema from a dataframe-like chunk.

    Object-dtype columns whose name ends in 'date' map to date32, other
    object columns to string; all remaining dtypes resolve through their
    pyarrow alias name.

    :param chunk: object exposing ``columns`` and ``dtypes`` in parallel
        order (e.g. a pandas DataFrame)
    :return: the inferred ``pa.schema``
    """
    fields = []
    # IDIOM: zip the columns with their dtypes instead of keeping a
    # manual index counter alongside the loop.
    for column, dtype in zip(chunk.columns, chunk.dtypes):
        if dtype.name == 'object':
            # Object dtype carries text; the column-name convention picks
            # out date columns — presumably upheld by upstream writers.
            pa_type = pa.date32() if column.endswith('date') else pa.string()
        else:
            pa_type = pa.type_for_alias(dtype.name)
        fields.append(pa.field(column, pa_type))
    return pa.schema(fields)
def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowIntervalType:
    """Rebuild an ArrowIntervalType from its serialized JSON metadata."""
    meta = json.loads(serialized.decode())
    return ArrowIntervalType(
        pyarrow.type_for_alias(meta["subtype"]),
        meta["inclusive"],
    )
def __arrow_ext_deserialize__(cls, storage_type, serialized):
    """Rebuild an ArrowIntervalType from its serialized JSON metadata."""
    meta = json.loads(serialized.decode())
    return ArrowIntervalType(
        pyarrow.type_for_alias(meta["subtype"]),
        meta["closed"],
    )
def pyarrow_datatype_from_dict(json_dict: Dict[str, Any]) -> pyarrow.DataType:
    """
    Create a DataType in PyArrow format from a Schema json format.
    :param json_dict: the DataType in json format
    :return: the DataType in PyArrow format
    """
    type_class = json_dict["type"]["name"]
    if type_class == "dictionary":
        # NOTE(review): dictionary-encoded entries become a map type keyed
        # by the indexType with the first child as value — presumably this
        # mirrors how map columns are serialized; confirm with the writer.
        key_type = json_dict["dictionary"]["indexType"]
        value_type = json_dict["children"][0]
        key_type = pyarrow_datatype_from_dict(key_type)
        value_type = pyarrow_datatype_from_dict(value_type)
        return pyarrow.map_(key_type, value_type)
    elif "dictionary" in json_dict:
        # A node that carries a "dictionary" entry but whose type name is
        # not "dictionary": synthesize key/value field dicts and recurse.
        key_type = {
            "name": "key",
            "type": json_dict["dictionary"]["indexType"],
            "nullable": json_dict["nullable"],
        }
        key = pyarrow_datatype_from_dict(key_type)
        if type_class == "list":
            # NOTE(review): the list value also uses indexType here (not the
            # child's type) and is wrapped in list<struct<...>> — looks
            # intentional for this serialization, but verify upstream.
            value_type = {
                "name": "val",
                "type": json_dict["dictionary"]["indexType"],
                "nullable": json_dict["nullable"],
            }
            return pyarrow.map_(
                key,
                pyarrow.list_(
                    pyarrow.field(
                        "element",
                        pyarrow.struct([pyarrow_field_from_dict(value_type)
                                        ]))),
            )
        value_type = {
            "name": "value",
            "type": json_dict["type"],
            "nullable": json_dict["nullable"],
        }
        return pyarrow.map_(key, pyarrow_datatype_from_dict(value_type))
    elif type_class == "list":
        # Lists carry their element type as the single child field; the
        # element field is explicitly named "element".
        field = json_dict["children"][0]
        element_type = pyarrow_datatype_from_dict(field)
        return pyarrow.list_(pyarrow.field("element", element_type))
    elif type_class == "struct":
        fields = [
            pyarrow_field_from_dict(field) for field in json_dict["children"]
        ]
        return pyarrow.struct(fields)
    elif type_class == "int":
        # e.g. "int" + 32 -> "int32", a valid pyarrow type alias.
        return pyarrow.type_for_alias(
            f'{type_class}{json_dict["type"]["bitWidth"]}')
    elif type_class == "date":
        type_info = json_dict["type"]
        # DAY granularity -> 32-bit date; anything else -> 64-bit date.
        if type_info["unit"] == "DAY":
            return pyarrow.date32()
        else:
            return pyarrow.date64()
    elif type_class == "time":
        type_info = json_dict["type"]
        # Translate JSON unit names to pyarrow alias suffixes; unknown
        # units fall back to seconds.
        if type_info["unit"] == "MICROSECOND":
            unit = "us"
        elif type_info["unit"] == "NANOSECOND":
            unit = "ns"
        elif type_info["unit"] == "MILLISECOND":
            unit = "ms"
        else:
            unit = "s"
        return pyarrow.type_for_alias(
            f'{type_class}{type_info["bitWidth"]}[{unit}]')
    elif type_class == "timestamp":
        type_info = json_dict["type"]
        # Default unit is nanoseconds when absent or unrecognized.
        if "unit" in type_info:
            if type_info["unit"] == "MICROSECOND":
                unit = "us"
            elif type_info["unit"] == "NANOSECOND":
                unit = "ns"
            elif type_info["unit"] == "MILLISECOND":
                unit = "ms"
            elif type_info["unit"] == "SECOND":
                unit = "s"
        else:
            unit = "ns"
        return pyarrow.type_for_alias(f"{type_class}[{unit}]")
    elif type_class.startswith("decimal"):
        type_info = json_dict["type"]
        return pyarrow.decimal128(precision=type_info["precision"],
                                  scale=type_info["scale"])
    elif type_class.startswith("floatingpoint"):
        # Arrow JSON spells float width as a precision name, not bitWidth.
        type_info = json_dict["type"]
        if type_info["precision"] == "HALF":
            return pyarrow.float16()
        elif type_info["precision"] == "SINGLE":
            return pyarrow.float32()
        elif type_info["precision"] == "DOUBLE":
            return pyarrow.float64()
    else:
        # Plain types ("string", "binary", ...) resolve directly by alias.
        return pyarrow.type_for_alias(type_class)