Beispiel #1
0
def pyarrow_datatype_from_dict(json_dict: Dict) -> pyarrow.DataType:
    """
    Create a DataType in PyArrow format from a Schema json format.

    The type name is read from ``json_dict["type"]["name"]``; nested types
    (dictionary, list, struct) are resolved recursively through
    ``json_dict["children"]``.

    :param json_dict: the DataType in json format
    :return: the DataType in PyArrow format
    :raises ValueError: if a timestamp type declares a unit other than
        SECOND, MILLISECOND, MICROSECOND or NANOSECOND
    """
    # Schema unit names mapped to the unit suffix used in PyArrow aliases.
    sub_second_units = {
        "MICROSECOND": "us",
        "NANOSECOND": "ns",
        "MILLISECOND": "ms",
    }

    type_info = json_dict["type"]
    type_class = type_info["name"]

    if type_class == "dictionary":
        key_type = json_dict["dictionary"]["indexType"]
        value_type = json_dict["children"][0]
        key_type = pyarrow_datatype_from_dict(key_type)
        value_type = pyarrow_datatype_from_dict(value_type)
        return pyarrow.map_(key_type, value_type)

    if type_class == "list":
        element_type = pyarrow_datatype_from_dict(json_dict["children"][0])
        return pyarrow.list_(element_type)

    if type_class == "struct":
        fields = [
            pyarrow_field_from_dict(field) for field in json_dict["children"]
        ]
        return pyarrow.struct(fields)

    if type_class in ("int", "float", "date"):
        # e.g. "int" + 32 -> "int32"
        return pyarrow.type_for_alias(f'{type_class}{type_info["bitWidth"]}')

    if type_class == "time":
        # Any unit that is not a known sub-second unit is treated as seconds.
        unit = sub_second_units.get(type_info["unit"], "s")
        return pyarrow.type_for_alias(
            f'{type_class}{type_info["bitWidth"]}[{unit}]')

    if type_class == "timestamp":
        if "unit" not in type_info:
            # No unit declared: default to nanoseconds.
            unit = "ns"
        else:
            unit = {**sub_second_units, "SECOND": "s"}.get(type_info["unit"])
            if unit is None:
                # Previously this path left ``unit`` unbound and crashed with
                # an UnboundLocalError; report the offending value instead.
                raise ValueError(
                    f'Unsupported timestamp unit: {type_info["unit"]!r}')
        return pyarrow.type_for_alias(f"{type_class}[{unit}]")

    if type_class.startswith("decimal"):
        return pyarrow.decimal128(precision=type_info["precision"],
                                  scale=type_info["scale"])

    # Everything else is expected to be a plain PyArrow type alias.
    return pyarrow.type_for_alias(type_class)
Beispiel #2
0
 def __arrow_ext_deserialize__(cls, storage_type, serialized):
     """Rebuild an ``ArrowTensorType`` from its serialized metadata.

     ``serialized`` is decoded and parsed as JSON. Its ``"shape"`` entry is
     passed through unchanged and its ``"subtype"`` entry is resolved with
     ``pa.type_for_alias``. ``storage_type`` is not consulted.
     """
     meta = json.loads(serialized.decode())
     return ArrowTensorType(
         shape=meta["shape"],
         subtype=pa.type_for_alias(meta["subtype"]),
     )
Beispiel #3
0
    def __init__(self, element_shape, pyarrow_dtype):
        """Set up the extension type with a list storage type.

        ``pyarrow_dtype`` may be a ``pa.DataType`` or anything whose string
        form is accepted by ``pa.type_for_alias``. ``element_shape`` is kept
        as a tuple on the instance.
        """
        if isinstance(pyarrow_dtype, pa.DataType):
            value_type = pyarrow_dtype
        else:
            value_type = pa.type_for_alias(str(pyarrow_dtype))

        # The shape is recorded before the base initializer is invoked.
        self._element_shape = tuple(element_shape)
        storage = pa.list_(value_type)
        pa.ExtensionType.__init__(self, storage, "dask-ms.tensor_type")
Beispiel #4
0
    def construct_from_string(cls, string: str) -> ArrowDtype:
        """
        Construct this type from a string.

        Parameters
        ----------
        string : str
            string should follow the format f"{pyarrow_type}[pyarrow]"
            e.g. int64[pyarrow]

        Returns
        -------
        ArrowDtype
            Instance wrapping the pyarrow type named before the suffix.

        Raises
        ------
        TypeError
            If ``string`` is not a str, does not end with "[pyarrow]", or
            names an unknown pyarrow type.
        NotImplementedError
            If the type name carries bracketed parameters that
            ``pa.type_for_alias`` does not accept.
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )
        suffix = "[pyarrow]"
        if not string.endswith(suffix):
            raise TypeError(f"'{string}' must end with '[pyarrow]'")
        # Strip only the trailing suffix. Splitting on the suffix would cut
        # at its first occurrence and silently drop anything after it
        # (e.g. "int64[pyarrow]junk[pyarrow]" would be read as "int64").
        base_type = string[: -len(suffix)]
        try:
            pa_dtype = pa.type_for_alias(base_type)
        except ValueError as err:
            has_parameters = re.search(r"\[.*\]", base_type)
            if has_parameters:
                raise NotImplementedError(
                    "Passing pyarrow type specific parameters "
                    f"({has_parameters.group()}) in the string is not supported. "
                    "Please construct an ArrowDtype object with a pyarrow_dtype "
                    "instance with specific parameters."
                ) from err
            raise TypeError(f"'{base_type}' is not a valid pyarrow data type.") from err
        return cls(pa_dtype)
Beispiel #5
0
    def __init__(self, subtype):
        """Set up the extension type with a fixed-size (2) list storage type.

        ``subtype`` may be a ``pa.DataType`` or anything whose string form is
        accepted by ``pa.type_for_alias``. The resolved type is kept on the
        instance as ``_subtype``.
        """
        if isinstance(subtype, pa.DataType):
            component_type = subtype
        else:
            component_type = pa.type_for_alias(str(subtype))

        # The attribute is assigned before the base initializer is invoked.
        self._subtype = component_type
        storage = pa.list_(component_type, 2)
        pa.ExtensionType.__init__(self, storage, "dask-ms.complex")
Beispiel #6
0
    def construct_from_string(cls, string):
        """Attempt to construct this type from a string.

        A leading ``fletcher[`` wrapper (and the final character) is removed
        first; the remainder is resolved with ``pa.type_for_alias``, except
        for ``"list<item: string>"`` which is mapped to a list of strings.

        Parameters
        ----------
        string : str

        Returns
        -------
        self : instance of 'cls'

        Raises
        ------
        TypeError
            If a class cannot be constructed from this 'string', including
            when 'string' is not a str or names an unknown arrow type.

        Examples
        --------
        If the extension dtype can be constructed without any arguments,
        the following may be an adequate implementation.
        >>> @classmethod
        ... def construct_from_string(cls, string):
        ...     if string == cls.name:
        ...         return cls()
        ...     else:
        ...         raise TypeError("Cannot construct a '{}' from "
        ...                         "'{}'".format(cls, string))
        """
        # Reject non-strings up front with the documented exception type
        # instead of failing with AttributeError on ``startswith``.
        if not isinstance(string, str):
            raise TypeError(
                "'construct_from_string' expects a string, got {}".format(
                    type(string)))

        # Remove fletcher specific naming from the arrow type string.
        if string.startswith("fletcher["):
            string = string[9:-1]

        if string == "list<item: string>":
            return cls(pa.list_(pa.string()))

        try:
            arrow_type = pa.type_for_alias(string)
        except (ValueError, KeyError) as err:
            # The documented contract is a TypeError for unknown strings.
            raise TypeError(
                "Cannot construct a '{}' from '{}'".format(
                    cls.__name__, string)) from err

        return cls(arrow_type)
Beispiel #7
0
    def __init__(self, subtype, inclusive: IntervalInclusiveType) -> None:
        """Set up the interval extension type over a left/right struct.

        ``subtype`` may be a ``pyarrow.DataType`` or anything whose string
        form is accepted by ``pyarrow.type_for_alias``.
        """
        # Instance attributes are assigned ahead of the base-class
        # initializer because that initializer calls serialize.
        assert inclusive in VALID_INCLUSIVE
        self._inclusive: IntervalInclusiveType = inclusive

        if isinstance(subtype, pyarrow.DataType):
            bound_type = subtype
        else:
            bound_type = pyarrow.type_for_alias(str(subtype))
        self._subtype = bound_type

        bounds = [("left", bound_type), ("right", bound_type)]
        pyarrow.ExtensionType.__init__(
            self, pyarrow.struct(bounds), "pandas.interval"
        )
Beispiel #8
0
    def __init__(self, subtype, closed) -> None:
        """Set up the interval extension type over a left/right struct.

        ``subtype`` may be a ``pyarrow.DataType`` or anything whose string
        form is accepted by ``pyarrow.type_for_alias``.
        """
        # ``_closed`` and ``_subtype`` have to exist before the base-class
        # initializer runs, since that initializer calls serialize.
        assert closed in VALID_CLOSED
        self._closed = closed

        endpoint_type = (
            subtype
            if isinstance(subtype, pyarrow.DataType)
            else pyarrow.type_for_alias(str(subtype))
        )
        self._subtype = endpoint_type

        struct_type = pyarrow.struct(
            [("left", endpoint_type), ("right", endpoint_type)]
        )
        pyarrow.ExtensionType.__init__(self, struct_type, "pandas.interval")
Beispiel #9
0
 def __init__(self, shape, subtype):
     """Set up the tensor extension type over fixed-size binary storage.

     The storage width in bytes is the product of ``shape`` multiplied by
     the subtype's bit width, integer-divided by 8.
     """
     # Both attributes are assigned ahead of the base-class initializer,
     # because that initializer calls serialize.
     self._shape = shape
     # Kept exactly as passed in; the alias resolution below only feeds
     # the storage size computation.
     self._subtype = subtype

     if isinstance(subtype, pa.DataType):
         arrow_subtype = subtype
     else:
         arrow_subtype = pa.type_for_alias(str(subtype))

     n_elements = functools.reduce(operator.mul, shape)
     n_bytes = n_elements * arrow_subtype.bit_width // 8
     self._storage_type = pa.binary(n_bytes)
     pa.ExtensionType.__init__(
         self, self._storage_type, "tensorpandas.tensor"
     )
Beispiel #10
0
    def construct_from_string(cls, string):
        """Attempt to construct this type from a string.

        A leading ``fletcher[`` wrapper (and the final character) is removed
        first; the remainder is resolved with ``pa.type_for_alias``, except
        for ``"list<item: string>"`` which is mapped to a list of strings.

        Parameters
        ----------
        string : str

        Returns
        -------
        self : instance of 'cls'

        Raises
        ------
        TypeError
            If a class cannot be constructed from this 'string', including
            when 'string' is not a str.

        Examples
        --------
        If the extension dtype can be constructed without any arguments,
        the following may be an adequate implementation.
        >>> @classmethod
        ... def construct_from_string(cls, string):
        ...     if string == cls.name:
        ...         return cls()
        ...     else:
        ...         raise TypeError("Cannot construct a '{}' from "
        ...                         "'{}'".format(cls, string))
        """
        if not isinstance(string, str):
            # Report the type actually received; the message used to be
            # hard-coded to "<class 'int'>" whatever was passed in.
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}")

        # Remove fletcher specific naming from the arrow type string.
        if string.startswith("fletcher["):
            string = string[9:-1]

        if string == "list<item: string>":
            return cls(pa.list_(pa.string()))

        try:
            type_for_alias = pa.type_for_alias(string)
        except (ValueError, KeyError) as err:
            # pandas API expects a TypeError
            msg = f"Cannot construct a '{cls.__name__}' from '{string}'"
            raise TypeError(msg) from err

        return cls(type_for_alias)
Beispiel #11
0
    def __init__(
        self,
        subtype,
        inclusive: str | None = None,
        closed: None | lib.NoDefault = lib.no_default,
    ) -> None:
        """Set up the interval extension type over a left/right struct.

        ``inclusive`` and ``closed`` are first passed through
        ``_warning_interval``; the resulting ``inclusive`` value is what gets
        validated and stored. ``subtype`` may be a ``pyarrow.DataType`` or
        anything whose string form is accepted by ``pyarrow.type_for_alias``.
        """
        # Instance attributes are assigned ahead of the base-class
        # initializer because that initializer calls serialize.
        inclusive, _ = _warning_interval(inclusive, closed)
        assert inclusive in VALID_CLOSED
        self._closed = inclusive

        if isinstance(subtype, pyarrow.DataType):
            bound_type = subtype
        else:
            bound_type = pyarrow.type_for_alias(str(subtype))
        self._subtype = bound_type

        bounds = [("left", bound_type), ("right", bound_type)]
        pyarrow.ExtensionType.__init__(
            self, pyarrow.struct(bounds), "pandas.interval"
        )
Beispiel #12
0
def test_type_for_alias():
    """Check that each known alias resolves to the expected pyarrow type."""
    expected_by_alias = {
        # signed integers
        'i1': pa.int8(),
        'int8': pa.int8(),
        'i2': pa.int16(),
        'int16': pa.int16(),
        'i4': pa.int32(),
        'int32': pa.int32(),
        'i8': pa.int64(),
        'int64': pa.int64(),
        # unsigned integers
        'u1': pa.uint8(),
        'uint8': pa.uint8(),
        'u2': pa.uint16(),
        'uint16': pa.uint16(),
        'u4': pa.uint32(),
        'uint32': pa.uint32(),
        'u8': pa.uint64(),
        'uint64': pa.uint64(),
        # floating point
        'f4': pa.float32(),
        'float32': pa.float32(),
        'f8': pa.float64(),
        'float64': pa.float64(),
        # dates, strings and binary
        'date32': pa.date32(),
        'date64': pa.date64(),
        'string': pa.string(),
        'str': pa.string(),
        'binary': pa.binary(),
        # parametrized temporal types
        'time32[s]': pa.time32('s'),
        'time32[ms]': pa.time32('ms'),
        'time64[us]': pa.time64('us'),
        'time64[ns]': pa.time64('ns'),
        'timestamp[s]': pa.timestamp('s'),
        'timestamp[ms]': pa.timestamp('ms'),
        'timestamp[us]': pa.timestamp('us'),
        'timestamp[ns]': pa.timestamp('ns'),
        'duration[s]': pa.duration('s'),
        'duration[ms]': pa.duration('ms'),
        'duration[us]': pa.duration('us'),
        'duration[ns]': pa.duration('ns'),
        'month_day_nano_interval': pa.month_day_nano_interval(),
    }

    for alias, arrow_type in expected_by_alias.items():
        assert pa.type_for_alias(alias) == arrow_type
Beispiel #13
0
def test_type_for_alias():
    """Check that each known alias resolves to the expected pyarrow type."""
    alias_type_pairs = (
        # integers, short and long spelling side by side
        ('i1', pa.int8()), ('int8', pa.int8()),
        ('i2', pa.int16()), ('int16', pa.int16()),
        ('i4', pa.int32()), ('int32', pa.int32()),
        ('i8', pa.int64()), ('int64', pa.int64()),
        ('u1', pa.uint8()), ('uint8', pa.uint8()),
        ('u2', pa.uint16()), ('uint16', pa.uint16()),
        ('u4', pa.uint32()), ('uint32', pa.uint32()),
        ('u8', pa.uint64()), ('uint64', pa.uint64()),
        # floating point
        ('f4', pa.float32()), ('float32', pa.float32()),
        ('f8', pa.float64()), ('float64', pa.float64()),
        # dates, strings and binary
        ('date32', pa.date32()),
        ('date64', pa.date64()),
        ('string', pa.string()),
        ('str', pa.string()),
        ('binary', pa.binary()),
        # parametrized temporal types
        ('time32[s]', pa.time32('s')),
        ('time32[ms]', pa.time32('ms')),
        ('time64[us]', pa.time64('us')),
        ('time64[ns]', pa.time64('ns')),
        ('timestamp[s]', pa.timestamp('s')),
        ('timestamp[ms]', pa.timestamp('ms')),
        ('timestamp[us]', pa.timestamp('us')),
        ('timestamp[ns]', pa.timestamp('ns')),
    )

    for alias, arrow_type in alias_type_pairs:
        resolved = pa.type_for_alias(alias)
        assert resolved == arrow_type
Beispiel #14
0
def infer_schema(chunk):
    """Build a pyarrow schema from the columns and dtypes of ``chunk``.

    Columns whose dtype name is ``'object'`` become ``date32`` when the
    column name ends with ``'date'`` and ``string`` otherwise. Every other
    column is mapped by passing its dtype name to ``pa.type_for_alias``.

    :param chunk: object exposing ``columns`` and ``dtypes`` of equal length
        (presumably a pandas DataFrame; confirm against callers)
    :return: the ``pa.Schema`` with one field per column, in column order
    """
    fields = []

    # Pair each column with its dtype by position. Indexing ``chunk.dtypes``
    # with a running integer relies on pandas' deprecated positional fallback
    # and becomes a label lookup when the column labels are integers.
    for column, dtype in zip(chunk.columns, chunk.dtypes):
        if dtype.name != 'object':
            arrow_type = pa.type_for_alias(dtype.name)
        elif column.endswith('date'):
            arrow_type = pa.date32()
        else:
            arrow_type = pa.string()
        fields.append(pa.field(column, arrow_type))

    return pa.schema(fields)
Beispiel #15
0
 def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowIntervalType:
     """Rebuild an ``ArrowIntervalType`` from its serialized metadata.

     ``serialized`` is decoded and parsed as JSON. Its ``"subtype"`` entry is
     resolved with ``pyarrow.type_for_alias`` and its ``"inclusive"`` entry is
     passed through unchanged. ``storage_type`` is not consulted.
     """
     payload = json.loads(serialized.decode())
     return ArrowIntervalType(
         pyarrow.type_for_alias(payload["subtype"]),
         payload["inclusive"],
     )
Beispiel #16
0
 def __arrow_ext_deserialize__(cls, storage_type, serialized):
     """Rebuild an ``ArrowIntervalType`` from its serialized metadata.

     ``serialized`` is decoded and parsed as JSON. Its ``"subtype"`` entry is
     resolved with ``pyarrow.type_for_alias`` and its ``"closed"`` entry is
     passed through unchanged. ``storage_type`` is not consulted.
     """
     payload = json.loads(serialized.decode())
     return ArrowIntervalType(
         pyarrow.type_for_alias(payload["subtype"]),
         payload["closed"],
     )
Beispiel #17
0
def pyarrow_datatype_from_dict(json_dict: Dict[str, Any]) -> pyarrow.DataType:
    """
    Create a DataType in PyArrow format from a Schema json format.

    The type name is read from ``json_dict["type"]["name"]``. Entries that
    carry a ``"dictionary"`` key are turned into map types; nested types
    (list, struct) are resolved recursively through ``json_dict["children"]``.

    :param json_dict: the DataType in json format
    :return: the DataType in PyArrow format
    :raises ValueError: if a timestamp type declares an unknown unit, or a
        floatingpoint type declares an unknown precision
    """
    # Schema unit names mapped to the unit suffix used in PyArrow aliases.
    sub_second_units = {
        "MICROSECOND": "us",
        "NANOSECOND": "ns",
        "MILLISECOND": "ms",
    }

    type_info = json_dict["type"]
    type_class = type_info["name"]

    if type_class == "dictionary":
        key_type = json_dict["dictionary"]["indexType"]
        value_type = json_dict["children"][0]
        key_type = pyarrow_datatype_from_dict(key_type)
        value_type = pyarrow_datatype_from_dict(value_type)
        return pyarrow.map_(key_type, value_type)

    if "dictionary" in json_dict:
        # Dictionary-encoded field: the index type becomes the map key.
        key_type = {
            "name": "key",
            "type": json_dict["dictionary"]["indexType"],
            "nullable": json_dict["nullable"],
        }
        key = pyarrow_datatype_from_dict(key_type)
        if type_class == "list":
            value_type = {
                "name": "val",
                "type": json_dict["dictionary"]["indexType"],
                "nullable": json_dict["nullable"],
            }
            return pyarrow.map_(
                key,
                pyarrow.list_(
                    pyarrow.field(
                        "element",
                        pyarrow.struct([pyarrow_field_from_dict(value_type)
                                        ]))),
            )
        value_type = {
            "name": "value",
            "type": json_dict["type"],
            "nullable": json_dict["nullable"],
        }
        return pyarrow.map_(key, pyarrow_datatype_from_dict(value_type))

    if type_class == "list":
        element_type = pyarrow_datatype_from_dict(json_dict["children"][0])
        return pyarrow.list_(pyarrow.field("element", element_type))

    if type_class == "struct":
        fields = [
            pyarrow_field_from_dict(field) for field in json_dict["children"]
        ]
        return pyarrow.struct(fields)

    if type_class == "int":
        # e.g. "int" + 32 -> "int32"
        return pyarrow.type_for_alias(f'{type_class}{type_info["bitWidth"]}')

    if type_class == "date":
        # Only the DAY unit maps to date32; anything else is date64.
        if type_info["unit"] == "DAY":
            return pyarrow.date32()
        return pyarrow.date64()

    if type_class == "time":
        # Any unit that is not a known sub-second unit is treated as seconds.
        unit = sub_second_units.get(type_info["unit"], "s")
        return pyarrow.type_for_alias(
            f'{type_class}{type_info["bitWidth"]}[{unit}]')

    if type_class == "timestamp":
        if "unit" not in type_info:
            # No unit declared: default to nanoseconds.
            unit = "ns"
        else:
            unit = {**sub_second_units, "SECOND": "s"}.get(type_info["unit"])
            if unit is None:
                # Previously this path left ``unit`` unbound and crashed with
                # an UnboundLocalError; report the offending value instead.
                raise ValueError(
                    f'Unsupported timestamp unit: {type_info["unit"]!r}')
        return pyarrow.type_for_alias(f"{type_class}[{unit}]")

    if type_class.startswith("decimal"):
        return pyarrow.decimal128(precision=type_info["precision"],
                                  scale=type_info["scale"])

    if type_class.startswith("floatingpoint"):
        precision = type_info["precision"]
        if precision == "HALF":
            return pyarrow.float16()
        if precision == "SINGLE":
            return pyarrow.float32()
        if precision == "DOUBLE":
            return pyarrow.float64()
        # Previously an unknown precision fell through and returned None,
        # contradicting the declared return type.
        raise ValueError(f"Unsupported floatingpoint precision: {precision!r}")

    # Everything else is expected to be a plain PyArrow type alias.
    return pyarrow.type_for_alias(type_class)