def test_map():
    ty = pa.map_(pa.string(), pa.int8())
    v = [('a', 1), ('b', 2)]
    s = pa.scalar(v, type=ty)

    assert len(s) == 2
    assert isinstance(s, pa.MapScalar)
    assert isinstance(s.values, pa.Array)
    assert repr(s) == "<pyarrow.MapScalar: [('a', 1), ('b', 2)]>"
    assert s.values.to_pylist() == [{
        'key': 'a',
        'value': 1
    }, {
        'key': 'b',
        'value': 2
    }]

    # test iteration
    for i, j in zip(s, v):
        assert i == j

    assert s.as_py() == v
    assert s[1] == (pa.scalar('b',
                              type=pa.string()), pa.scalar(2, type=pa.int8()))
    assert s[-1] == s[1]
    assert s[-2] == s[0]
    with pytest.raises(IndexError):
        s[-3]
    with pytest.raises(IndexError):
        s[2]

    restored = pickle.loads(pickle.dumps(s))
    assert restored.equals(s)
Exemple #2
0
def test_fill_null_chunked_array(arrow_type):
    fill_value = pa.scalar(5, type=arrow_type)
    arr = pa.chunked_array([pa.array([None, 2, 3, 4], type=arrow_type)])
    result = arr.fill_null(fill_value)
    expected = pa.chunked_array([pa.array([5, 2, 3, 4], type=arrow_type)])
    assert result.equals(expected)

    arr = pa.chunked_array([
        pa.array([1, 2], type=arrow_type),
        pa.array([], type=arrow_type),
        pa.array([None, 4], type=arrow_type)
    ])
    expected = pa.chunked_array([
        pa.array([1, 2], type=arrow_type),
        pa.array([], type=arrow_type),
        pa.array([5, 4], type=arrow_type)
    ])
    result = arr.fill_null(fill_value)
    assert result.equals(expected)

    # Implicit conversions
    result = arr.fill_null(5)
    assert result.equals(expected)

    result = arr.fill_null(pa.scalar(5, type='int8'))
    assert result.equals(expected)
def test_fixed_size_binary():
    s = pa.scalar(b'foof', type=pa.binary(4))
    assert isinstance(s, pa.FixedSizeBinaryScalar)
    assert s.as_py() == b'foof'

    with pytest.raises(pa.ArrowInvalid):
        pa.scalar(b'foof5', type=pa.binary(4))
Exemple #4
0
def index(data, value, start=None, end=None, *, memory_pool=None):
    """
    Find the index of the first occurrence of a given value.

    Parameters
    ----------
    data : Array or ChunkedArray
    value : Scalar-like object
    start : int, optional
    end : int, optional
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    index : the index, or -1 if not found
    """
    if start is not None:
        if end is not None:
            data = data.slice(start, end - start)
        else:
            data = data.slice(start)
    elif end is not None:
        data = data.slice(0, end)

    if not isinstance(value, pa.Scalar):
        value = pa.scalar(value, type=data.type)
    elif data.type != value.type:
        value = pa.scalar(value.as_py(), type=data.type)
    options = IndexOptions(value=value)
    result = call_function('index', [data], options, memory_pool)
    if start is not None and result.as_py() >= 0:
        result = pa.scalar(result.as_py() + start, type=pa.int64())
    return result
Exemple #5
0
def test_ext_scalar_from_storage():
    ty = UuidType()

    s = pa.ExtensionScalar.from_storage(ty, None)
    assert isinstance(s, pa.ExtensionScalar)
    assert s.type == ty
    assert s.is_valid is False
    assert s.value is None

    s = pa.ExtensionScalar.from_storage(ty, b"0123456789abcdef")
    assert isinstance(s, pa.ExtensionScalar)
    assert s.type == ty
    assert s.is_valid is True
    assert s.value == pa.scalar(b"0123456789abcdef", ty.storage_type)

    s = pa.ExtensionScalar.from_storage(ty, pa.scalar(None, ty.storage_type))
    assert isinstance(s, pa.ExtensionScalar)
    assert s.type == ty
    assert s.is_valid is False
    assert s.value is None

    s = pa.ExtensionScalar.from_storage(
        ty, pa.scalar(b"0123456789abcdef", ty.storage_type))
    assert isinstance(s, pa.ExtensionScalar)
    assert s.type == ty
    assert s.is_valid is True
    assert s.value == pa.scalar(b"0123456789abcdef", ty.storage_type)
Exemple #6
0
def test_compare_scalar(typ):
    if typ == "array":

        def con(values):
            return pa.array(values)
    else:

        def con(values):
            return pa.chunked_array([values])

    arr = con([1, 2, 3, None])
    scalar = pa.scalar(2)

    result = pc.equal(arr, scalar)
    assert result.equals(con([False, True, False, None]))

    if typ == "array":
        nascalar = pa.scalar(None, type="int64")
        result = pc.equal(arr, nascalar)
        assert result.to_pylist() == [None, None, None, None]

    result = pc.not_equal(arr, scalar)
    assert result.equals(con([True, False, True, None]))

    result = pc.less(arr, scalar)
    assert result.equals(con([True, False, False, None]))

    result = pc.less_equal(arr, scalar)
    assert result.equals(con([True, True, False, None]))

    result = pc.greater(arr, scalar)
    assert result.equals(con([False, False, True, None]))

    result = pc.greater_equal(arr, scalar)
    assert result.equals(con([False, True, True, None]))
def test_basics(value, ty, klass, deprecated):
    s = pa.scalar(value, type=ty)
    assert isinstance(s, klass)
    assert s.as_py() == value
    assert s == pa.scalar(value, type=ty)
    assert s != value
    assert s != "else"
    assert hash(s) == hash(s)
    assert s.is_valid is True
    assert s != None  # noqa: E711
    with pytest.warns(FutureWarning):
        assert isinstance(s, deprecated)

    s = pa.scalar(None, type=s.type)
    assert s.is_valid is False
    assert s.as_py() is None
    assert s != pa.scalar(value, type=ty)

    # test pickle roundtrip
    restored = pickle.loads(pickle.dumps(s))
    assert s.equals(restored)

    # test that scalars are weak-referenceable
    wr = weakref.ref(s)
    assert wr() is not None
    del s
    assert wr() is None
Exemple #8
0
def test_compare_string_scalar(typ):
    if typ == "array":
        def con(values): return pa.array(values)
    else:
        def con(values): return pa.chunked_array([values])

    arr = con(['a', 'b', 'c', None])
    scalar = pa.scalar('b')

    result = pc.equal(arr, scalar)
    assert result.equals(con([False, True, False, None]))

    if typ == "array":
        nascalar = pa.scalar(None, type="string")
        result = pc.equal(arr, nascalar)
        isnull = pc.is_null(result)
        assert isnull.equals(con([True, True, True, True]))

    result = pc.not_equal(arr, scalar)
    assert result.equals(con([True, False, True, None]))

    result = pc.less(arr, scalar)
    assert result.equals(con([True, False, False, None]))

    result = pc.less_equal(arr, scalar)
    assert result.equals(con([True, True, False, None]))

    result = pc.greater(arr, scalar)
    assert result.equals(con([False, False, True, None]))

    result = pc.greater_equal(arr, scalar)
    assert result.equals(con([False, True, True, None]))
Exemple #9
0
    def get_data(self, selector: SeriesSelector, start_date: datetime,
                 end_date: datetime) -> pa.Table:
        """Read data in one of the predefined formats.

        The complete file will be loaded in an Arrow table during processing.
        """
        data = self.__read_all_data(selector)
        # pylint: disable=no-member
        on_or_after = pyarrow.compute.greater_equal(data["ts"],
                                                    pa.scalar(start_date))
        before = pyarrow.compute.less(data["ts"], pa.scalar(end_date))
        return data.filter(pyarrow.compute.and_(on_or_after, before))
def test_index():
    arr = pa.array([0, 1, None, 3, 4], type=pa.int64())
    assert pc.index(arr, pa.scalar(0)).as_py() == 0
    assert pc.index(arr, pa.scalar(2, type=pa.int8())).as_py() == -1
    assert pc.index(arr, 4).as_py() == 4
    assert arr.index(3, start=2).as_py() == 3
    assert arr.index(None).as_py() == -1

    arr = pa.chunked_array([[1, 2], [1, 3]], type=pa.int64())
    assert arr.index(1).as_py() == 0
    assert arr.index(1, start=2).as_py() == 2
    assert arr.index(1, start=1, end=2).as_py() == -1
Exemple #11
0
def compute_functions_example():
	import pyarrow.compute

	a = pa.array([1, 1, 2, 3])
	pa.compute.sum(a)
	print("pa.compute.sum(a) = {}.".format(pa.compute.sum(a)))

	b = pa.array([4, 1, 2, 8])
	print("pa.compute.equal(a, b) = {}.".format(pa.compute.equal(a, b)))

	x, y = pa.scalar(7.8), pa.scalar(9.3)
	print("pa.compute.multiply(x, y) = {}.".format(pa.compute.multiply(x, y)))
Exemple #12
0
def check_cython_example_module(mod):
    arr = pa.array([1, 2, 3])
    assert mod.get_array_length(arr) == 3
    with pytest.raises(TypeError, match="not an array"):
        mod.get_array_length(None)

    scal = pa.scalar(123)
    cast_scal = mod.cast_scalar(scal, pa.utf8())
    assert cast_scal == pa.scalar("123")
    with pytest.raises(NotImplementedError,
                       match="casting scalars of type int64 to type list"):
        mod.cast_scalar(scal, pa.list_(pa.int64()))
Exemple #13
0
def test_nulls():
    null = pa.scalar(None)
    assert null is pa.NA
    assert null.as_py() is None
    assert null != "something"
    assert (null == pa.scalar(None)) is True
    assert (null == 0) is False
    assert pa.NA == pa.NA
    assert pa.NA not in [5]

    arr = pa.array([None, None])
    for v in arr:
        assert v is pa.NA
        assert v.as_py() is None
Exemple #14
0
def test_bool():
    false = pa.scalar(False)
    true = pa.scalar(True)

    assert isinstance(false, pa.BooleanScalar)
    assert isinstance(true, pa.BooleanScalar)

    assert repr(true) == "<pyarrow.BooleanScalar: True>"
    assert str(true) == "True"
    assert repr(false) == "<pyarrow.BooleanScalar: False>"
    assert str(false) == "False"

    assert true.as_py() is True
    assert false.as_py() is False
def test_date_cast():
    # ARROW-10472 - casting fo scalars doesn't segfault
    scalar = pa.scalar(datetime.datetime(2012, 1, 1), type=pa.timestamp("us"))
    expected = datetime.date(2012, 1, 1)
    for ty in [pa.date32(), pa.date64()]:
        result = scalar.cast(ty)
        assert result.as_py() == expected
Exemple #16
0
    def _preprocess_host_value(self, value, dtype):
        valid = not _is_null_host_scalar(value)

        if isinstance(dtype, Decimal64Dtype):
            value = pa.scalar(value,
                              type=pa.decimal128(dtype.precision,
                                                 dtype.scale)).as_py()
        if isinstance(value, decimal.Decimal) and dtype is None:
            dtype = Decimal64Dtype._from_decimal(value)

        value = to_cudf_compatible_scalar(value, dtype=dtype)

        if dtype is None:
            if not valid:
                if isinstance(value, (np.datetime64, np.timedelta64)):
                    unit, _ = np.datetime_data(value)
                    if unit == "generic":
                        raise TypeError(
                            "Cant convert generic NaT to null scalar")
                    else:
                        dtype = value.dtype
                else:
                    raise TypeError(
                        "dtype required when constructing a null scalar")
            else:
                dtype = value.dtype

        if not isinstance(dtype, Decimal64Dtype):
            dtype = np.dtype(dtype)

        if not valid:
            value = NA

        return value, dtype
Exemple #17
0
 def __ge__(self, other):
     if isinstance(other, self.__class__):
         f = lambda t: pa.compute.greater_equal(self.get(t), other.get(t))
         return self.breed('>=', other.key, f, required=other.required, boolean=True)
     else:
         f = lambda t: pa.compute.greater_equal(self.get(t), pa.scalar(other))
         return self.breed('>=', str(other), f, boolean=True)
Exemple #18
0
 def __truediv__(self, other):
     if isinstance(other, self.__class__):
         f = lambda t: pa.compute.divide(self.get(t), other.get(t))
         return self.breed('/', other.key, f, required=other.required)
     else:
         f = lambda t: pa.compute.divide(self.get(t), pa.scalar(other))
         return self.breed('/', str(other), f)
Exemple #19
0
def _parse_timestamp(value: str) -> pa.scalar:
    with warnings.catch_warnings():
        # numpy warns with DeprecationWarning when converting timezone offsets
        # to UTC. Even though converting timezone offsets to UTC is obviously
        # what everybody wants.
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        return pa.scalar(np.datetime64(value, "ns"), pa.timestamp("ns"))
Exemple #20
0
 def coalesce(self, other):
     if isinstance(other, self.__class__):
         f = lambda t: pa.array(np.where(np.isnan(self.get(t)), other.get(t), self.get(t)))
         return ColumnNode(key='coalesce({}, {})'.format(self.key, other.key), required=self.required + other.required, func=f, depth=self.depth + 1)
     else:
         f = lambda t: pa.compute.fill_null(self.get(t), pa.scalar(other))
         return ColumnNode(key='coalesce({}, {})'.format(self.key, other), required=self.required, func=f, depth=self.depth + 1)
def test_fill_null():
    arr = pa.array([1, 2, None, 4], type=pa.int8())
    fill_value = pa.array([5], type=pa.int8())
    with pytest.raises(pa.ArrowInvalid, match="tried to convert to int"):
        arr.fill_null(fill_value)

    arr = pa.array([None, None, None, None], type=pa.null())
    fill_value = pa.scalar(None, type=pa.null())
    result = arr.fill_null(fill_value)
    expected = pa.array([None, None, None, None])
    assert result.equals(expected)

    arr = pa.array(['a', 'bb', None])
    result = arr.fill_null('ccc')
    expected = pa.array(['a', 'bb', 'ccc'])
    assert result.equals(expected)

    arr = pa.array([b'a', b'bb', None], type=pa.large_binary())
    result = arr.fill_null('ccc')
    expected = pa.array([b'a', b'bb', b'ccc'], type=pa.large_binary())
    assert result.equals(expected)

    arr = pa.array(['a', 'bb', None])
    result = arr.fill_null(None)
    expected = pa.array(['a', 'bb', None])
    assert result.equals(expected)
Exemple #22
0
def test_basics(value, ty, klass, deprecated):
    s = pa.scalar(value, type=ty)
    assert isinstance(s, klass)
    assert s.as_py() == value
    assert s == pa.scalar(value, type=ty)
    assert s != value
    assert s != "else"
    assert hash(s) == hash(s)
    assert s.is_valid is True
    with pytest.warns(FutureWarning):
        assert isinstance(s, deprecated)

    s = pa.scalar(None, type=s.type)
    assert s.is_valid is False
    assert s.as_py() is None
    assert s != pa.scalar(value, type=ty)
Exemple #23
0
 def __mul__(self, other):
     if isinstance(other, self.__class__):
         f = lambda t: pa.compute.multiply(self.get(t), other.get(t))
         return self.breed('*', other.key, f, required=other.required)
     else:
         f = lambda t: pa.compute.multiply(self.get(t), pa.scalar(other))
         return self.breed('*', str(other), f)
Exemple #24
0
    def _cmp_method(self, other, op):
        from pandas.arrays import BooleanArray

        pc_func = ARROW_CMP_FUNCS[op.__name__]
        if isinstance(other, ArrowExtensionArray):
            result = pc_func(self._data, other._data)
        elif isinstance(other, (np.ndarray, list)):
            result = pc_func(self._data, other)
        elif is_scalar(other):
            try:
                result = pc_func(self._data, pa.scalar(other))
            except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid):
                mask = isna(self) | isna(other)
                valid = ~mask
                result = np.zeros(len(self), dtype="bool")
                result[valid] = op(np.array(self)[valid], other)
                return BooleanArray(result, mask)
        else:
            return NotImplementedError(
                f"{op.__name__} not implemented for {type(other)}")

        if pa_version_under2p0:
            result = result.to_pandas().values
        else:
            result = result.to_numpy()
        return BooleanArray._from_sequence(result)
Exemple #25
0
    def isin(self, values):
        if pa_version_under2p0:
            return super().isin(values)

        value_set = [
            pa_scalar.as_py() for pa_scalar in
            [pa.scalar(value, from_pandas=True) for value in values]
            if pa_scalar.type in (pa.string(), pa.null())
        ]

        # for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True
        # for null values, so we short-circuit to return all False array.
        if not len(value_set):
            return np.zeros(len(self), dtype=bool)

        kwargs = {}
        if pa_version_under3p0:
            # in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises
            # with unexpected keyword argument in pyarrow 3.0.0+
            kwargs["skip_null"] = True

        result = pc.is_in(self._data, value_set=pa.array(value_set), **kwargs)
        # pyarrow 2.0.0 returned nulls, so we explicily specify dtype to convert nulls
        # to False
        return np.array(result, dtype=np.bool_)
def test_ext_scalar_from_array():
    data = [
        b"0123456789abcdef", b"0123456789abcdef", b"zyxwvutsrqponmlk", None
    ]
    storage = pa.array(data, type=pa.binary(16))
    ty1 = UuidType()
    ty2 = ParamExtType(16)

    a = pa.ExtensionArray.from_storage(ty1, storage)
    b = pa.ExtensionArray.from_storage(ty2, storage)

    scalars_a = list(a)
    assert len(scalars_a) == 4

    for s, val in zip(scalars_a, data):
        assert isinstance(s, pa.ExtensionScalar)
        assert s.is_valid == (val is not None)
        assert s.type == ty1
        if val is not None:
            assert s.value == pa.scalar(val, storage.type)
        else:
            assert s.value is None
        assert s.as_py() == val

    scalars_b = list(b)
    assert len(scalars_b) == 4

    for sa, sb in zip(scalars_a, scalars_b):
        assert sa.is_valid == sb.is_valid
        assert sa.as_py() == sb.as_py()
        assert sa != sb
Exemple #27
0
 def fetch(self, verbose):
     t = self.parent.get(verbose)
     for c in self.nan_columns:
         arr = pa.compute.fill_null(
             t.column(c).combine_chunks(), pa.scalar(self.value))
         t = t.drop([c])
         t = t.append_column(c, arr)
     return t
Exemple #28
0
def test_decimal256():
    v = decimal.Decimal("1234567890123456789012345678901234567890.123")
    s = pa.scalar(v)
    assert isinstance(s, pa.Decimal256Scalar)
    assert s.as_py() == v
    assert s.type == pa.decimal256(43, 3)

    v = decimal.Decimal("1.1234")
    with pytest.raises(pa.ArrowInvalid):
        pa.scalar(v, type=pa.decimal256(4, scale=3))
    # TODO: Add the following after implementing Decimal256 scaling.
    # with pytest.raises(pa.ArrowInvalid):
    #     pa.scalar(v, type=pa.decimal256(5, scale=3))

    s = pa.scalar(v, type=pa.decimal256(5, scale=4))
    assert isinstance(s, pa.Decimal256Scalar)
    assert s.as_py() == v
Exemple #29
0
    def _preprocess_host_value(self, value, dtype):
        valid = not cudf._lib.scalar._is_null_host_scalar(value)

        if isinstance(value, list):
            if dtype is not None:
                raise TypeError("Lists may not be cast to a different dtype")
            else:
                dtype = ListDtype.from_arrow(
                    pa.infer_type([value], from_pandas=True))
                return value, dtype
        elif isinstance(dtype, ListDtype):
            if value not in {None, NA}:
                raise ValueError(f"Can not coerce {value} to ListDtype")
            else:
                return NA, dtype

        if isinstance(value, dict):
            if dtype is None:
                dtype = StructDtype.from_arrow(
                    pa.infer_type([value], from_pandas=True))
            return value, dtype
        elif isinstance(dtype, StructDtype):
            if value not in {None, NA}:
                raise ValueError(f"Can not coerce {value} to StructDType")
            else:
                return NA, dtype

        if isinstance(dtype, Decimal64Dtype):
            value = pa.scalar(value,
                              type=pa.decimal128(dtype.precision,
                                                 dtype.scale)).as_py()
        if isinstance(value, decimal.Decimal) and dtype is None:
            dtype = Decimal64Dtype._from_decimal(value)

        value = to_cudf_compatible_scalar(value, dtype=dtype)

        if dtype is None:
            if not valid:
                if isinstance(value, (np.datetime64, np.timedelta64)):
                    unit, _ = np.datetime_data(value)
                    if unit == "generic":
                        raise TypeError(
                            "Cant convert generic NaT to null scalar")
                    else:
                        dtype = value.dtype
                else:
                    raise TypeError(
                        "dtype required when constructing a null scalar")
            else:
                dtype = value.dtype

        if not isinstance(dtype, Decimal64Dtype):
            dtype = cudf.dtype(dtype)

        if not valid:
            value = NA

        return value, dtype
Exemple #30
0
def test_time():
    t1 = datetime.time(18, 0)
    t2 = datetime.time(21, 0)

    types = [pa.time32('s'), pa.time32('ms'), pa.time64('us'), pa.time64('ns')]
    for ty in types:
        for t in [t1, t2]:
            s = pa.scalar(t, type=ty)
            assert s.as_py() == t