Example #1
def _pandas_series_to_arrow(
    values: pd.Series | pd.DatetimeIndex,
    nan_to_none: bool = True,
    min_len: int | None = None,
) -> pa.Array:
    """
    Convert a pandas Series to an Arrow Array.

    Parameters
    ----------
    values
        Series to convert to arrow
    nan_to_none
        Interpret `NaN` as missing values
    min_len
        In case of null values, this length is used to create a dummy f64 array (with all values set to null).

    Returns
    -------
    pa.Array
    """
    dtype = values.dtype
    if dtype == "object" and len(values) > 0:
        first_non_none = _get_first_non_none(
            values.values)  # type: ignore[arg-type]

        if isinstance(first_non_none, str):
            return pa.array(values, pa.large_utf8(), from_pandas=nan_to_none)
        if first_non_none is None:
            return pa.nulls(min_len, pa.large_utf8())

        return pa.array(values, from_pandas=nan_to_none)
    else:
        return pa.array(values, from_pandas=nan_to_none)
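
A minimal usage sketch of the function above (my own, not from the source; it assumes pandas/pyarrow imported as pd/pa and the module's _get_first_non_none helper): an object-dtype Series of strings comes back as large_utf8, with None mapped to null.

import pandas as pd
import pyarrow as pa

s = pd.Series(["foo", None, "bar"], dtype="object")
arr = _pandas_series_to_arrow(s)
assert arr.type == pa.large_utf8()  # string path forces 64-bit offsets
assert arr.null_count == 1          # None became a null, not a value
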
Example #2
def coerce_arrow(array: pa.Array) -> pa.Array:
    # also coerces timezone to naive representation
    # units are accounted for by pyarrow
    if "timestamp" in str(array.type):
        warnings.warn(
            "Conversion of (potentially) timezone aware to naive datetimes. TZ information may be lost",
        )
        ts_ms = pa.compute.cast(array, pa.timestamp("ms"), safe=False)
        ms = pa.compute.cast(ts_ms, pa.int64())
        del ts_ms
        array = pa.compute.cast(ms, pa.date64())
        del ms
    # note: Decimal256 cannot be cast to float
    elif isinstance(array.type, pa.Decimal128Type):
        array = pa.compute.cast(array, pa.float64())

    # simplest solution is to cast to (large) string arrays;
    # this makes a copy and is expensive
    elif isinstance(array.type, pa.DictionaryType):
        if pa.types.is_string(array.type.value_type):
            array = pa.compute.cast(array, pa.large_utf8())
        else:
            raise ValueError(
                "polars does not support dictionary encoded types other than strings"
            )

    if hasattr(array, "num_chunks") and array.num_chunks > 1:
        if pa.types.is_string(array.type):
            array = pa.compute.cast(array, pa.large_utf8())
        elif pa.types.is_list(array.type):
            array = pa.compute.cast(array, pa.large_list(array.type.value_type))
        array = array.combine_chunks()
    return array
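
A quick sketch of the dictionary branch (my own illustration, not part of the source): a dictionary-encoded string array is decoded to a plain large_utf8 array, at the cost of a copy.

import pyarrow as pa
import pyarrow.compute  # coerce_arrow uses pa.compute.cast

dict_arr = pa.array(["x", "y", "x"]).dictionary_encode()
out = coerce_arrow(dict_arr)
assert out.type == pa.large_utf8()
assert out.to_pylist() == ["x", "y", "x"]
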
Example #3
def from_pandas(df: "pandas.DataFrame", rechunk: bool = True) -> "DataFrame":
    """
    Convert from a pandas DataFrame to a polars DataFrame

    Parameters
    ----------
    df
        DataFrame to convert
    rechunk
        Make sure that all data is contiguous.

    Returns
    -------
    A Polars DataFrame
    """

    # Note: we first tried to infer the schema via pyarrow and then modify the schema if needed.
    # However, arrow 3.0 determines the type of a string like this:
    #       pa.array(array).type
    # needlessly allocating and failing when the string is too large for the string dtype.
    data = {}

    for (name, dtype) in zip(df.columns, df.dtypes):
        if dtype == "object" and isinstance(df[name][0], str):
            data[name] = pa.array(df[name], pa.large_utf8())
        elif dtype == "datetime64[ns]":
            data[name] = pa.compute.cast(
                pa.array(np.array(df[name].values, dtype="datetime64[ms]")),
                pa.date64())
        else:
            data[name] = pa.array(df[name])

    table = pa.table(data)
    return from_arrow_table(table, rechunk)
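
The schema note above in code form (my own sketch): passing the target type to pa.array skips the inference pass, so a string column goes straight to large_utf8.

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"s": ["a", "bb", "ccc"]})
col = pa.array(df["s"], pa.large_utf8())  # type given upfront, no inference
assert col.type == pa.large_utf8()
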
Example #4
def coerce_arrow(array: pa.Array) -> pa.Array:
    # also coerces timezone to naive representation
    # units are accounted for by pyarrow
    if "timestamp" in str(array.type):
        warnings.warn(
            "Conversion of (potentially) timezone aware to naive datetimes. TZ information may be lost",
        )
        ts_ms = pa.compute.cast(array, pa.timestamp("ms"), safe=False)
        ms = pa.compute.cast(ts_ms, pa.int64())
        del ts_ms
        array = pa.compute.cast(ms, pa.date64())
        del ms
    # note: Decimal256 cannot be cast to float
    elif isinstance(array.type, pa.Decimal128Type):
        array = pa.compute.cast(array, pa.float64())

    if hasattr(array, "num_chunks") and array.num_chunks > 1:
        # we have to coerce before combining chunks, because pyarrow panics if
        # offsets overflow
        if pa.types.is_string(array.type):
            array = pa.compute.cast(array, pa.large_utf8())
        elif pa.types.is_list(array.type):
            # pyarrow does not seem to support casting from list to large_list,
            # so we convert to large list ourselves and do the re-alloc on the polars/arrow side
            chunks = []
            for arr in array.iterchunks():
                chunks.append(pl.from_arrow(arr).to_arrow())
            array = pa.chunked_array(chunks)

        array = array.combine_chunks()
    return array
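
A small check of the chunked-string path (my own sketch): the chunks are widened to large_utf8 before combine_chunks, so 32-bit offsets cannot overflow when the chunks are concatenated.

import pyarrow as pa
import pyarrow.compute

chunked = pa.chunked_array([["a", "b"], ["c"]])  # string type, two chunks
out = coerce_arrow(chunked)
assert out.type == pa.large_utf8()
assert out.to_pylist() == ["a", "b", "c"]
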
Example #5
    def test_column_types_dict(self):
        # Ask for dict-encoded column types in ConvertOptions
        column_types = [('a', pa.dictionary(pa.int32(), pa.utf8())),
                        ('b', pa.dictionary(pa.int32(), pa.int64())),
                        ('c', pa.dictionary(pa.int32(), pa.decimal128(11, 2))),
                        ('d', pa.dictionary(pa.int32(), pa.large_utf8()))]

        opts = ConvertOptions(column_types=dict(column_types))
        rows = (b"a,b,c,d\n"
                b"abc,123456,1.0,zz\n"
                b"defg,123456,0.5,xx\n"
                b"abc,N/A,1.0,xx\n")
        table = self.read_bytes(rows, convert_options=opts)

        schema = pa.schema(column_types)
        expected = {
            'a': ["abc", "defg", "abc"],
            'b': [123456, 123456, None],
            'c': [Decimal("1.00"),
                  Decimal("0.50"),
                  Decimal("1.00")],
            'd': ["zz", "xx", "xx"],
        }
        assert table.schema == schema
        assert table.to_pydict() == expected

        # Unsupported index type
        column_types[0] = ('a', pa.dictionary(pa.int8(), pa.utf8()))

        opts = ConvertOptions(column_types=dict(column_types))
        with pytest.raises(NotImplementedError):
            table = self.read_bytes(rows, convert_options=opts)
Example #6
def test_type_ids():
    # Having this fixed is very important because internally we rely on this id
    # to parse from Python
    for idx, arrow_type in [
        (0, pa.null()),
        (1, pa.bool_()),
        (2, pa.uint8()),
        (3, pa.int8()),
        (4, pa.uint16()),
        (5, pa.int16()),
        (6, pa.uint32()),
        (7, pa.int32()),
        (8, pa.uint64()),
        (9, pa.int64()),
        (10, pa.float16()),
        (11, pa.float32()),
        (12, pa.float64()),
        (13, pa.string()),
        (13, pa.utf8()),
        (14, pa.binary()),
        (16, pa.date32()),
        (17, pa.date64()),
        (18, pa.timestamp("us")),
        (19, pa.time32("s")),
        (20, pa.time64("us")),
        (23, pa.decimal128(8, 1)),
        (34, pa.large_utf8()),
        (35, pa.large_binary()),
    ]:
        assert idx == arrow_type.id
Example #7
def _pandas_series_to_arrow(
    values: Union["pd.Series", "pd.DatetimeIndex"],
    nan_to_none: bool = True,
    min_len: Optional[int] = None,
) -> "pa.Array":
    """
    Convert a pandas Series to an Arrow Array.

    Parameters
    ----------
    values
        Series to convert to arrow
    nan_to_none
        Interpret `NaN` as missing values
    min_len
        In case of null values, this length is used to create a dummy f64 array (with all values set to null).

    Returns
    -------
    pa.Array
    """
    dtype = values.dtype
    if dtype == "object" and len(values) > 0:
        if isinstance(values.values[0], str):
            return pa.array(values, pa.large_utf8(), from_pandas=nan_to_none)

        # the array is a null array; we fall back to a float64 array
        if values.values[0] is None and min_len is not None:
            return pa.nulls(min_len, pa.float64())
        else:
            return pa.array(values, from_pandas=nan_to_none)
    else:
        return pa.array(values, from_pandas=nan_to_none)
Example #8
def _resize_arrow_type(t):
    if t == pa.string():
        return pa.large_string()
    if t == pa.utf8():
        return pa.large_utf8()
    if t == pa.binary():
        return pa.large_binary()
    if isinstance(t, pa.lib.ListType):
        return pa.large_list(t.value_type)
    return t
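
A sketch of lifting the helper over a whole schema (upgrade_schema is my own name, not from the source):

import pyarrow as pa

def upgrade_schema(schema: pa.Schema) -> pa.Schema:
    # widen every 32-bit-offset type to its 64-bit "large" variant
    return pa.schema([pa.field(f.name, _resize_arrow_type(f.type)) for f in schema])

schema = pa.schema([("s", pa.string()), ("xs", pa.list_(pa.int32()))])
assert upgrade_schema(schema) == pa.schema(
    [("s", pa.large_string()), ("xs", pa.large_list(pa.int32()))]
)
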
Example #9
def test_arrow_dict_to_polars() -> None:
    pa_dict = pa.DictionaryArray.from_arrays(
        indices=np.array([0, 1, 2, 3, 1, 0, 2, 3, 3, 2]),
        dictionary=np.array(["AAA", "BBB", "CCC", "DDD"]),
    ).cast(pa.large_utf8())

    s = pl.Series(
        name="pa_dict",
        values=["AAA", "BBB", "CCC", "DDD", "BBB", "AAA", "CCC", "DDD", "DDD", "CCC"],
    )

    assert s.series_equal(pl.Series("pa_dict", pa_dict))
Example #10
def _map_arrow_type(arrow_type):
    arrow_to_dh = {
        pa.null(): '',
        pa.bool_(): '',
        pa.int8(): 'byte',
        pa.int16(): 'short',
        pa.int32(): 'int',
        pa.int64(): 'long',
        pa.uint8(): '',
        pa.uint16(): 'char',
        pa.uint32(): '',
        pa.uint64(): '',
        pa.float16(): '',
        pa.float32(): 'float',
        pa.float64(): 'double',
        pa.time32('s'): '',
        pa.time32('ms'): '',
        pa.time64('us'): '',
        pa.time64('ns'): 'io.deephaven.time.DateTime',
        pa.timestamp('us', tz=None): '',
        pa.timestamp('ns', tz=None): '',
        pa.date32(): 'java.time.LocalDate',
        pa.date64(): 'java.time.LocalDate',
        pa.binary(): '',
        pa.string(): 'java.lang.String',
        pa.utf8(): 'java.lang.String',
        pa.large_binary(): '',
        pa.large_string(): '',
        pa.large_utf8(): '',
        # decimal128(int precision, int scale=0)
        # list_(value_type, int list_size=-1)
        # large_list(value_type)
        # map_(key_type, item_type[, keys_sorted])
        # struct(fields)
        # dictionary(index_type, value_type, …)
        # field(name, type, bool nullable = True[, metadata])
        # schema(fields[, metadata])
        # from_numpy_dtype(dtype)
    }

    dh_type = arrow_to_dh.get(arrow_type)
    if not dh_type:
        # if this is a case of timestamp with tz specified
        if isinstance(arrow_type, pa.TimestampType):
            dh_type = "io.deephaven.time.DateTime"

    if not dh_type:
        raise DHError(f'unsupported arrow data type : {arrow_type}')

    return {"deephaven:type": dh_type}
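
Two quick checks of the mapping above (my own; note that a timezone-aware timestamp misses the dict lookup and is caught by the TimestampType branch):

assert _map_arrow_type(pa.int32()) == {"deephaven:type": "int"}
assert _map_arrow_type(pa.timestamp("ns", tz="UTC")) == {
    "deephaven:type": "io.deephaven.time.DateTime"
}
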
Example #11
def _pandas_series_to_arrow(
        values: Union["pd.Series", "pd.DatetimeIndex"]) -> pa.Array:
    """
    Convert a pandas Series to an Arrow array.
    """
    dtype = values.dtype
    if dtype == "datetime64[ns]":
        # We first cast to ms because that's the unit of Date64.
        # Then we cast via int64 to date64. Casting directly to Date64 leads to
        # loss of time information: https://github.com/ritchie46/polars/issues/476
        arr = pa.array(np.array(values.values, dtype="datetime64[ms]"))
        arr = pa.compute.cast(arr, pa.int64())
        return pa.compute.cast(arr, pa.date64())
    elif dtype == "object" and len(values) > 0 and isinstance(
            values.iloc[0], str):
        return pa.array(values, pa.large_utf8())
    else:
        return pa.array(values)
Example #12
def _pandas_series_to_arrow(
    values: Union["pd.Series", "pd.DatetimeIndex"],
    nan_to_none: bool = True,
    min_len: Optional[int] = None,
) -> "pa.Array":
    """
    Convert a pandas Series to an Arrow Array.

    Parameters
    ----------
    values
        Series to convert to arrow
    nan_to_none
        Interpret `NaN` as missing values
    min_len
        In case of null values, this length is used to create a dummy f64 array (with all values set to null).

    Returns
    -------
    pa.Array
    """
    dtype = values.dtype
    if dtype == "datetime64[ns]":
        # We first cast to ms because that's the unit of Datetime.
        # Then we cast via int64 to datetime. Casting directly to Datetime leads to
        # loss of time information: https://github.com/pola-rs/polars/issues/476
        arr = pa.array(np.array(values.values, dtype="datetime64[ms]"),
                       from_pandas=nan_to_none)
        arr = pa.compute.cast(arr, pa.int64())
        return pa.compute.cast(arr, pa.timestamp("ms"))
    elif dtype == "object" and len(values) > 0:
        if isinstance(values.values[0], str):
            return pa.array(values, pa.large_utf8(), from_pandas=nan_to_none)

        # the array is a null array; we fall back to a float64 array
        if values.values[0] is None and min_len is not None:
            return pa.nulls(min_len, pa.float64())
        else:
            return pa.array(values, from_pandas=nan_to_none)
    else:
        return pa.array(values, from_pandas=nan_to_none)
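
The datetime branch in isolation (my own sketch; the linked issue explains why the hop through int64 is there):

from datetime import datetime

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.compute

s = pd.Series(pd.to_datetime(["2021-01-01 12:34:56"]))
arr = pa.array(np.array(s.values, dtype="datetime64[ms]"))
arr = pa.compute.cast(arr, pa.int64())          # milliseconds since epoch
arr = pa.compute.cast(arr, pa.timestamp("ms"))  # reinterpret, no data change
assert arr[0].as_py() == datetime(2021, 1, 1, 12, 34, 56)
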
Example #13
def from_pandas(
        df: "pandas.DataFrame",
        rechunk: bool = True  # noqa: F821
) -> "DataFrame":
    """
    Convert from a pandas DataFrame to a polars DataFrame

    Parameters
    ----------
    df
        DataFrame to convert
    rechunk
        Make sure that all data is contiguous.

    Returns
    -------
    A Polars DataFrame
    """

    # Note: we first tried to infer the schema via pyarrow and then modify the schema if needed.
    # However, arrow 3.0 determines the type of a string like this:
    #       pa.array(array).type
    # needlessly allocating and failing when the string is too large for the string dtype.
    data = {}

    for (name, dtype) in zip(df.columns, df.dtypes):
        if dtype == "object" and isinstance(df[name][0], str):
            data[name] = pa.array(df[name], pa.large_utf8())
        elif dtype == "datetime64[ns]":
            # We first cast to ms because that's the unit of Date64.
            # Then we cast via int64 to date64. Casting directly to Date64 leads to
            # loss of time information: https://github.com/ritchie46/polars/issues/476
            arr = pa.array(np.array(df[name].values, dtype="datetime64[ms]"))
            arr = pa.compute.cast(arr, pa.int64())
            data[name] = pa.compute.cast(arr, pa.date64())
        else:
            data[name] = pa.array(df[name])

    table = pa.table(data)
    return from_arrow(table, rechunk)
Example #14
def convert_string_column(col: ColumnObject) -> Tuple[pa.Array, Any]:
    """
    Convert a string column to an Arrow array (returned together with its buffers).
    """
    # Missing
    if col.null_count > 0:
        if col.describe_null != (3, 0):
            raise TypeError("Only support arrow style mask data")

    # Retrieve the data buffers
    buffers = col.get_buffers()

    dbuffer, bdtype = buffers["data"]  # buffer containing the UTF-8 code units
    obuffer, odtype = buffers["offsets"]  # buffer containing the index offsets demarcating the beginning and end of each string
    mbuffer, mdtype = buffers["validity"]  # buffer indicating the presence of missing values

    # Convert the buffers to NumPy arrays
    dt = (_DtypeKind.UINT, 8, None, None)  # note: in order to go from STRING to an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array)
    dbuf = buffer_to_ndarray(dbuffer, dt)

    obuf = buffer_to_ndarray(obuffer, odtype)
    mbuf = buffer_to_ndarray(mbuffer, mdtype)

    # not sure what the best way to communicate the two types of strings is
    if obuffer._x.dtype == "int64":
        arrow_type = pa.large_utf8()
    elif obuffer._x.dtype == "int32":
        arrow_type = pa.utf8()
    else:
        raise TypeError(f"Unsupported offsets dtype: {obuffer._x.dtype}")
    length = obuf.size - 1

    buffers = [None, pa.py_buffer(obuf), pa.py_buffer(dbuf)]
    arrow_array = pa.Array.from_buffers(arrow_type, length, buffers)

    # Apply the mask
    if col.null_count > 0:
        arrow_array = pa.array(arrow_array.tolist(), mask=mbuf)

    return arrow_array, buffers
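
The offsets-width rule in miniature (my own illustration): 64-bit offsets mean large_utf8, 32-bit offsets mean utf8.

import numpy as np
import pyarrow as pa

dbuf = np.frombuffer(b"hiworld", dtype=np.uint8)  # UTF-8 code units
obuf = np.array([0, 2, 7], dtype=np.int64)        # int64 offsets -> large_utf8
bufs = [None, pa.py_buffer(obuf), pa.py_buffer(dbuf)]
arr = pa.Array.from_buffers(pa.large_utf8(), len(obuf) - 1, bufs)
assert arr.to_pylist() == ["hi", "world"]
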
Example #15
def coerce_arrow(array: "pa.Array") -> "pa.Array":
    if array.type == pa.timestamp("s"):
        array = pa.compute.cast(
            pa.compute.multiply(pa.compute.cast(array, pa.int64()), 1000),
            pa.date64(),
        )
    elif array.type == pa.timestamp("ms"):
        array = pa.compute.cast(pa.compute.cast(array, pa.int64()),
                                pa.date64())
    elif array.type == pa.timestamp("us"):
        array = pa.compute.cast(
            pa.compute.divide(pa.compute.cast(array, pa.int64()), 1000),
            pa.date64(),
        )
    elif array.type == pa.timestamp("ns"):
        array = pa.compute.cast(
            pa.compute.divide(pa.compute.cast(array, pa.int64()), 1000000),
            pa.date64(),
        )
    # note: Decimal256 cannot be cast to float
    elif isinstance(array.type, pa.Decimal128Type):
        array = pa.compute.cast(array, pa.float64())

    # simplest solution is to cast to (large) string arrays;
    # this makes a copy and is expensive
    elif isinstance(array, pa.DictionaryArray):
        if array.dictionary.type == pa.string():
            array = pa.compute.cast(pa.compute.cast(array, pa.utf8()),
                                    pa.large_utf8())
        else:
            raise ValueError(
                "polars does not support dictionary encoded types other than strings"
            )

    if hasattr(array, "num_chunks") and array.num_chunks > 1:
        array = array.combine_chunks()
    return array
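
The microsecond branch on one value (my own check; the int64 hop divides microseconds down to the milliseconds that date64 stores):

from datetime import date, datetime

import pyarrow as pa
import pyarrow.compute

us = pa.array([datetime(2021, 1, 1)], pa.timestamp("us"))
out = coerce_arrow(us)
assert out.type == pa.date64()
assert out[0].as_py() == date(2021, 1, 1)
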
Example #16
def test_no_mem_copy():
    strings = ["a", "", "cdef", "", "g"]
    # data for above string array
    dbuf = np.array([97, 99, 100, 101, 102, 103], dtype='uint8')
    obuf = np.array([0, 1, 1, 5, 5, 6], dtype='int64')
    length = 5
    buffers = [None, pa.py_buffer(obuf), pa.py_buffer(dbuf)]
    s = pa.Array.from_buffers(pa.large_utf8(), length, buffers)
    x = np.arange(0, 5)
    df = vaex.from_arrays(x=x, s=s)
    df2 = _from_dataframe_to_vaex(df.__dataframe__())

    # primitive data
    x[0] = 999
    assert df2.x.tolist() == [999, 1, 2, 3, 4]

    # strings
    assert df.s.tolist() == strings
    assert df2.s.tolist() == strings
    # mutate the buffer data (something neither arrow nor vaex actually supports/wants)
    strings[0] = "b"
    dbuf[0] += 1
    assert df.s.tolist() == strings
    assert df2.s.tolist() == strings
Example #17
 ("decimal128(38,1)", pa.decimal128(38, 1)),
 ("decimal128(1,2)", pa.decimal128(1, 2)),
 ("time32(s)", pa.time32("s")),
 ("time32(ms)", pa.time32("ms")),
 ("time64(us)", pa.time64("us")),
 ("time64(ns)", pa.time64("ns")),
 ("timestamp(s)", pa.timestamp("s")),
 ("timestamp(ms)", pa.timestamp("ms")),
 ("timestamp(us)", pa.timestamp("us")),
 ("timestamp(ns)", pa.timestamp("ns")),
 ("date32", pa.date32()),
 ("date64", pa.date64()),
 ("string", pa.string()),
 ("large_string", pa.large_string()),
 ("utf8", pa.utf8()),
 ("large_utf8", pa.large_utf8()),
 ("binary", pa.binary()),
 ("binary(128)", pa.binary(128)),
 ("large_binary", pa.large_binary()),
 ("struct<num:int64>", pa.struct([("num", pa.int64())])),
 ("list<int64>", pa.list_(pa.int64())),
 ("list_<list<int64>>", pa.list_(pa.list_(pa.int64()))),
 ("list_<int64>", pa.list_(pa.int64())),
 ("list_<list_<int64>>", pa.list_(pa.list_(pa.int64()))),
 ("large_list<int64>", pa.large_list(pa.int64())),
 ("large_list<large_list<int64>>",
  pa.large_list(pa.large_list(pa.int64()))),
 (
     "struct<num:int64, newnum:int64>",
     pa.struct([("num", pa.int64()), ("newnum", pa.int64())]),
 ),
Example #18
    def __init__(
        self,
        name: str,
        values: "Union[np.array, List[Optional[Any]]]" = None,
        nullable: bool = True,
        dtype: "Optional[DataType]" = None,
    ):
        """

        Parameters
        ----------
        name
            Name of the series
        values
            Values of the series
        nullable
            If nullable.
                None values in a list will be interpreted as missing.
                NaN values in a numpy array will be interpreted as missing. Note that missing and NaNs are not the same
                in Polars
            Series creation may be faster if set to False and there are no null values.
        """
        # assume the first input was the values
        if values is None and not isinstance(name, str):
            values = name
            name = ""
        if values.__class__ == self.__class__:
            values.rename(name)
            self._s = values._s
            return

        self._s: PySeries
        # series path
        if isinstance(values, Series):
            self._from_pyseries(values)
            return
        elif isinstance(values, dict):
            raise ValueError(
                f"Constructing a Series with a dict is not supported for {values}"
            )
        elif isinstance(values, pa.Array):
            self._s = self.from_arrow(name, values)._s
            return

        # castable to numpy
        if not isinstance(values, np.ndarray) and not nullable:
            values = np.array(values)

        if dtype is not None:
            if dtype == Int8:
                self._s = PySeries.new_i8(name, values)
            elif dtype == Int16:
                self._s = PySeries.new_i16(name, values)
            elif dtype == Int32:
                self._s = PySeries.new_i32(name, values)
            elif dtype == Int64:
                self._s = PySeries.new_i64(name, values)
            elif dtype == UInt8:
                self._s = PySeries.new_u8(name, values)
            elif dtype == UInt16:
                self._s = PySeries.new_u16(name, values)
            elif dtype == UInt32:
                self._s = PySeries.new_u32(name, values)
            elif dtype == UInt64:
                self._s = PySeries.new_u64(name, values)
            elif dtype == Float32:
                self._s = PySeries.new_f32(name, values)
            elif dtype == Float64:
                self._s = PySeries.new_f64(name, values)
            elif dtype == Boolean:
                self._s = PySeries.new_bool(name, values)
            elif dtype == Utf8:
                self._s = PySeries.new_str(name, values)
            else:
                raise ValueError(
                    f"dtype {dtype} not yet supported when creating a Series")
            return

        # numpy path
        if isinstance(values, np.ndarray):
            if not values.data.contiguous:
                values = np.array(values)
            if len(values.shape) > 1:
                self._s = PySeries.new_object(name, values)
                return
            dtype = values.dtype
            if dtype == np.int64:
                self._s = PySeries.new_i64(name, values)
            elif dtype == np.int32:
                self._s = PySeries.new_i32(name, values)
            elif dtype == np.int16:
                self._s = PySeries.new_i16(name, values)
            elif dtype == np.int8:
                self._s = PySeries.new_i8(name, values)
            elif dtype == np.float32:
                self._s = PySeries.new_f32(name, values, nullable)
            elif dtype == np.float64:
                self._s = PySeries.new_f64(name, values, nullable)
            elif isinstance(values[0], str):
                self._s = PySeries.new_str(name, values)
            elif dtype == np.bool_:
                self._s = PySeries.new_bool(name, values)
            elif dtype == np.uint8:
                self._s = PySeries.new_u8(name, values)
            elif dtype == np.uint16:
                self._s = PySeries.new_u16(name, values)
            elif dtype == np.uint32:
                self._s = PySeries.new_u32(name, values)
            elif dtype == np.uint64:
                self._s = PySeries.new_u64(name, values)
            else:
                self._s = PySeries.new_object(name, values)
            return
        # list path
        else:
            dtype = _find_first_non_none(values)
            # order is important as booleans are instances of int in Python.
            if isinstance(dtype, bool):
                self._s = PySeries.new_opt_bool(name, values)
            elif isinstance(dtype, int):
                self._s = PySeries.new_opt_i64(name, values)
            elif isinstance(dtype, float):
                self._s = PySeries.new_opt_f64(name, values)
            elif isinstance(dtype, str):
                self._s = PySeries.new_str(name, values)
            # make list array
            elif isinstance(dtype, (list, tuple)):
                value_dtype = _find_first_non_none(dtype)

                # we can expect a failure if we pass `[[12], "foo", 9]`
                # in that case we catch the exception and create an object type
                try:
                    if isinstance(value_dtype, bool):
                        arrow_array = pa.array(values,
                                               pa.large_list(pa.bool_()))
                    elif isinstance(value_dtype, int):
                        arrow_array = pa.array(values,
                                               pa.large_list(pa.int64()))
                    elif isinstance(value_dtype, float):
                        arrow_array = pa.array(values,
                                               pa.large_list(pa.float64()))
                    elif isinstance(value_dtype, str):
                        arrow_array = pa.array(values,
                                               pa.large_list(pa.large_utf8()))
                    else:
                        self._s = PySeries.new_object(name, values)
                        return
                    self._s = Series.from_arrow(name, arrow_array)._s

                except pa.lib.ArrowInvalid:
                    self._s = PySeries.new_object(name, values)
            else:
                self._s = PySeries.new_object(name, values)
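
The branch-order comment in the list path can be checked directly (my own note): bool is a subclass of int in Python, so the bool test must come before the int test.

assert isinstance(True, int) and isinstance(True, bool)
assert not isinstance(1, bool)  # plain ints still take the int branch
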
Example #19
    }


def py_type_to_constructor(dtype: Type[Any]) -> Callable[..., "PySeries"]:
    """
    Get the right PySeries constructor for the given Python dtype.
    """
    try:
        return _PY_TYPE_TO_CONSTRUCTOR[dtype]
    except KeyError:
        return PySeries.new_object


if _PYARROW_AVAILABLE and not _DOCUMENTING:
    _PY_TYPE_TO_ARROW_TYPE = {
        float: pa.float64(),
        int: pa.int64(),
        str: pa.large_utf8(),
        bool: pa.bool_(),
    }


def py_type_to_arrow_type(dtype: Type[Any]) -> "pa.lib.DataType":
    """
    Convert a Python dtype to an Arrow dtype.
    """
    try:
        return _PY_TYPE_TO_ARROW_TYPE[dtype]
    except KeyError:
        raise ValueError(f"Cannot parse dtype {dtype} into Arrow dtype.")
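
Usage sketch (my own), assuming pyarrow is available so the table above was populated:

assert py_type_to_arrow_type(str) == pa.large_utf8()
assert py_type_to_arrow_type(int) == pa.int64()
try:
    py_type_to_arrow_type(complex)  # not in the table
except ValueError as exc:
    assert "Cannot parse dtype" in str(exc)
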
Example #20
 def pickle_set_string(x):
     keys = x.key_array()
     keys = pa.array(keys.to_numpy(), type=pa.large_utf8())
     return create_set_string, (keys, x.null_value, x.nan_count, x.null_count, x.fingerprint)
Example #21
class TestAbstractFileParserStatics:
    @pytest.mark.parametrize(  # testing all datatypes as laid out here: https://json-schema.org/understanding-json-schema/reference/type.html
        "input_json_type, output_pyarrow_type",
        [
            ("string", pa.large_string()),
            ("number", pa.float64()),
            ("integer", pa.int64()),
            ("object", pa.large_string()),
            ("array", pa.large_string()),
            ("boolean", pa.bool_()),
            ("null", pa.large_string()),
        ],
    )
    def test_json_type_to_pyarrow_type(self, input_json_type: str, output_pyarrow_type: Any) -> None:
        # Json -> PyArrow direction
        LOGGER.info(f"asserting that JSON type '{input_json_type}' converts to PyArrow type '{output_pyarrow_type}'...")
        assert AbstractFileParser.json_type_to_pyarrow_type(input_json_type) == output_pyarrow_type

    @pytest.mark.parametrize(  # testing all datatypes as laid out here: https://arrow.apache.org/docs/python/api/datatypes.html
        "input_pyarrow_types, output_json_type",
        [
            ((pa.null(),), "string"),  # null type
            ((pa.bool_(),), "boolean"),  # boolean type
            (
                (pa.int8(), pa.int16(), pa.int32(), pa.int64(), pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()),
                "integer",
            ),  # integer types
            ((pa.float16(), pa.float32(), pa.float64(), pa.decimal128(5, 10), pa.decimal256(3, 8)), "number"),  # number types
            ((pa.time32("s"), pa.time64("ns"), pa.timestamp("ms"), pa.date32(), pa.date64()), "string"),  # temporal types
            ((pa.binary(), pa.large_binary()), "string"),  # binary types
            ((pa.string(), pa.utf8(), pa.large_string(), pa.large_utf8()), "string"),  # string types
            ((pa.list_(pa.string()), pa.large_list(pa.timestamp("us"))), "string"),  # array types
            ((pa.map_(pa.string(), pa.float32()), pa.dictionary(pa.int16(), pa.list_(pa.string()))), "string"),  # object types
        ],
    )
    def test_json_type_to_pyarrow_type_reverse(self, input_pyarrow_types: Tuple[Any], output_json_type: str) -> None:
        # PyArrow -> Json direction (reverse=True)
        for typ in input_pyarrow_types:
            LOGGER.info(f"asserting that PyArrow type '{typ}' converts to JSON type '{output_json_type}'...")
            assert AbstractFileParser.json_type_to_pyarrow_type(typ, reverse=True) == output_json_type

    @pytest.mark.parametrize(  # if expecting fail, put pyarrow_schema as None
        "json_schema, pyarrow_schema",
        [
            (
                {"a": "string", "b": "number", "c": "integer", "d": "object", "e": "array", "f": "boolean", "g": "null"},
                {
                    "a": pa.large_string(),
                    "b": pa.float64(),
                    "c": pa.int64(),
                    "d": pa.large_string(),
                    "e": pa.large_string(),
                    "f": pa.bool_(),
                    "g": pa.large_string(),
                },
            ),
            ({"single_column": "object"}, {"single_column": pa.large_string()}),
            ({}, {}),
            ({"a": "NOT A REAL TYPE", "b": "another fake type"}, {"a": pa.large_string(), "b": pa.large_string()}),
            (["string", "object"], None),  # bad input type
        ],
    )
    def test_json_schema_to_pyarrow_schema(self, json_schema: Mapping[str, Any], pyarrow_schema: Mapping[str, Any]) -> None:
        # Json -> PyArrow direction
        if pyarrow_schema is not None:
            assert AbstractFileParser.json_schema_to_pyarrow_schema(json_schema) == pyarrow_schema
        else:
            with pytest.raises(Exception) as e_info:
                AbstractFileParser.json_schema_to_pyarrow_schema(json_schema)
            LOGGER.debug(str(e_info))

    @pytest.mark.parametrize(  # if expecting fail, put json_schema as None
        "pyarrow_schema, json_schema",
        [
            (
                {
                    "a": pa.utf8(),
                    "b": pa.float16(),
                    "c": pa.uint32(),
                    "d": pa.map_(pa.string(), pa.float32()),
                    "e": pa.bool_(),
                    "f": pa.date64(),
                },
                {"a": "string", "b": "number", "c": "integer", "d": "string", "e": "boolean", "f": "string"},
            ),
            ({"single_column": pa.int32()}, {"single_column": "integer"}),
            ({}, {}),
            ({"a": "NOT A REAL TYPE", "b": "another fake type"}, {"a": "string", "b": "string"}),
            (["string", "object"], None),  # bad input type
        ],
    )
    def test_json_schema_to_pyarrow_schema_reverse(self, pyarrow_schema: Mapping[str, Any], json_schema: Mapping[str, Any]) -> None:
        # PyArrow -> Json direction (reverse=True)
        if json_schema is not None:
            assert AbstractFileParser.json_schema_to_pyarrow_schema(pyarrow_schema, reverse=True) == json_schema
        else:
            with pytest.raises(Exception) as e_info:
                AbstractFileParser.json_schema_to_pyarrow_schema(pyarrow_schema, reverse=True)
            LOGGER.debug(str(e_info))
Example #22
 def test_large_utf8(self):
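     # the third positional argument to pyarrow.array is the null mask:
     # True marks a null, so the array is ["a", None, "c"]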
     array = pyarrow.array(
         ["a", "b", "c"], pyarrow.large_utf8(), numpy.array([False, True, False])
     )
     self._test_data(array)
Example #23
 testcase_name="basic_types",
 inputs=[
     {
         "bool": pa.array([False, None, True], type=pa.bool_()),
         "int64": pa.array([1, None, 3], type=pa.int64()),
         "uint64": pa.array([1, None, 3], type=pa.uint64()),
         "int32": pa.array([1, None, 3], type=pa.int32()),
         "uint32": pa.array([1, None, 3], type=pa.uint32()),
         "float": pa.array([1., None, 3.], type=pa.float32()),
         "double": pa.array([1., None, 3.], type=pa.float64()),
         "bytes": pa.array([b"abc", None, b"ghi"], type=pa.binary()),
         "large_bytes": pa.array([b"abc", None, b"ghi"],
                                 type=pa.large_binary()),
         "unicode": pa.array([u"abc", None, u"ghi"], type=pa.utf8()),
         "large_unicode": pa.array([u"abc", None, u"ghi"],
                                   type=pa.large_utf8()),
     },
     {
         "bool": pa.array([None, False], type=pa.bool_()),
         "int64": pa.array([None, 4], type=pa.int64()),
         "uint64": pa.array([None, 4], type=pa.uint64()),
         "int32": pa.array([None, 4], type=pa.int32()),
         "uint32": pa.array([None, 4], type=pa.uint32()),
         "float": pa.array([None, 4.], type=pa.float32()),
         "double": pa.array([None, 4.], type=pa.float64()),
         "bytes": pa.array([None, b"jkl"], type=pa.binary()),
         "large_bytes": pa.array([None, b"jkl"], type=pa.large_binary()),
         "unicode": pa.array([None, u"jkl"], type=pa.utf8()),
         "large_unicode": pa.array([None, u"jkl"], type=pa.large_utf8()),
     },
 ],
Example #24
     "int32":
     pa.array([1, None, 3], type=pa.int32()),
     "uint32":
     pa.array([1, None, 3], type=pa.uint32()),
     "float":
     pa.array([1., None, 3.], type=pa.float32()),
     "double":
     pa.array([1., None, 3.], type=pa.float64()),
     "bytes":
     pa.array([b"abc", None, b"ghi"], type=pa.binary()),
     "large_bytes":
     pa.array([b"abc", None, b"ghi"], type=pa.large_binary()),
     "unicode":
     pa.array([u"abc", None, u"ghi"], type=pa.utf8()),
     "large_unicode":
     pa.array([u"abc", None, u"ghi"], type=pa.large_utf8()),
 },
 {
     "bool": pa.array([None, False], type=pa.bool_()),
     "int64": pa.array([None, 4], type=pa.int64()),
     "uint64": pa.array([None, 4], type=pa.uint64()),
     "int32": pa.array([None, 4], type=pa.int32()),
     "uint32": pa.array([None, 4], type=pa.uint32()),
     "float": pa.array([None, 4.], type=pa.float32()),
     "double": pa.array([None, 4.], type=pa.float64()),
     "bytes": pa.array([None, b"jkl"], type=pa.binary()),
     "large_bytes": pa.array([None, b"jkl"],
                             type=pa.large_binary()),
     "unicode": pa.array([None, u"jkl"], type=pa.utf8()),
     "large_unicode": pa.array([None, u"jkl"],
                               type=pa.large_utf8()),
Example #25
    ctx.register_udf("udf", fn, input_types, output_type)

    batches = ctx.sql("SELECT udf(a) AS tt FROM t").collect()
    result = batches[0].column(0)

    assert result == pa.array(expected_values)


_null_mask = np.array([False, True, False])


@pytest.mark.parametrize(
    "arr",
    [
        pa.array(["a", "b", "c"], pa.utf8(), _null_mask),
        pa.array(["a", "b", "c"], pa.large_utf8(), _null_mask),
        pa.array([b"1", b"2", b"3"], pa.binary(), _null_mask),
        pa.array([b"1111", b"2222", b"3333"], pa.large_binary(), _null_mask),
        pa.array([False, True, True], None, _null_mask),
        pa.array([0, 1, 2], None),
        helpers.data_binary_other(),
        helpers.data_date32(),
        helpers.data_with_nans(),
        # C data interface missing
        pytest.param(
            pa.array([b"1111", b"2222", b"3333"], pa.binary(4), _null_mask),
            marks=pytest.mark.xfail,
        ),
        pytest.param(helpers.data_datetime("s"), marks=pytest.mark.xfail),
        pytest.param(helpers.data_datetime("ms"), marks=pytest.mark.xfail),
        pytest.param(helpers.data_datetime("us"), marks=pytest.mark.xfail),
Example #26
    "i": Int32,
    ("q" if sys.platform == "win32" else "l"): Int64,
    "B": UInt8,
    "H": UInt16,
    "I": UInt32,
    ("Q" if sys.platform == "win32" else "L"): UInt64,
    "f": Float32,
    "d": Float64,
    "?": Boolean,
}

if _PYARROW_AVAILABLE:
    _PY_TYPE_TO_ARROW_TYPE: dict[type, pa.lib.DataType] = {
        float: pa.float64(),
        int: pa.int64(),
        str: pa.large_utf8(),
        bool: pa.bool_(),
        date: pa.date32(),
        time: pa.time64("us"),
        datetime: pa.timestamp("us"),
        timedelta: pa.duration("us"),
    }

    _DTYPE_TO_ARROW_TYPE = {
        Int8: pa.int8(),
        Int16: pa.int16(),
        Int32: pa.int32(),
        Int64: pa.int64(),
        UInt8: pa.uint8(),
        UInt16: pa.uint16(),
        UInt32: pa.uint32(),