def test_map():
    ty = pa.map_(pa.string(), pa.int8())
    v = [('a', 1), ('b', 2)]
    s = pa.scalar(v, type=ty)

    assert len(s) == 2
    assert isinstance(s, pa.MapScalar)
    assert isinstance(s.values, pa.Array)
    assert repr(s) == "<pyarrow.MapScalar: [('a', 1), ('b', 2)]>"
    assert s.values.to_pylist() == [
        {'key': 'a', 'value': 1},
        {'key': 'b', 'value': 2}
    ]

    # test iteration
    for i, j in zip(s, v):
        assert i == j

    assert s.as_py() == v
    assert s[1] == (
        pa.scalar('b', type=pa.string()),
        pa.scalar(2, type=pa.int8())
    )
    assert s[-1] == s[1]
    assert s[-2] == s[0]

    with pytest.raises(IndexError):
        s[-3]
    with pytest.raises(IndexError):
        s[2]

    restored = pickle.loads(pickle.dumps(s))
    assert restored.equals(s)
def test_fill_null_chunked_array(arrow_type):
    fill_value = pa.scalar(5, type=arrow_type)
    arr = pa.chunked_array([pa.array([None, 2, 3, 4], type=arrow_type)])
    result = arr.fill_null(fill_value)
    expected = pa.chunked_array([pa.array([5, 2, 3, 4], type=arrow_type)])
    assert result.equals(expected)

    arr = pa.chunked_array([
        pa.array([1, 2], type=arrow_type),
        pa.array([], type=arrow_type),
        pa.array([None, 4], type=arrow_type)
    ])
    expected = pa.chunked_array([
        pa.array([1, 2], type=arrow_type),
        pa.array([], type=arrow_type),
        pa.array([5, 4], type=arrow_type)
    ])
    result = arr.fill_null(fill_value)
    assert result.equals(expected)

    # Implicit conversions
    result = arr.fill_null(5)
    assert result.equals(expected)
    result = arr.fill_null(pa.scalar(5, type='int8'))
    assert result.equals(expected)
def test_fixed_size_binary():
    s = pa.scalar(b'foof', type=pa.binary(4))
    assert isinstance(s, pa.FixedSizeBinaryScalar)
    assert s.as_py() == b'foof'

    with pytest.raises(pa.ArrowInvalid):
        pa.scalar(b'foof5', type=pa.binary(4))
def index(data, value, start=None, end=None, *, memory_pool=None):
    """
    Find the index of the first occurrence of a given value.

    Parameters
    ----------
    data : Array or ChunkedArray
    value : Scalar-like object
    start : int, optional
    end : int, optional
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    index : the index, or -1 if not found
    """
    if start is not None:
        if end is not None:
            data = data.slice(start, end - start)
        else:
            data = data.slice(start)
    elif end is not None:
        data = data.slice(0, end)

    if not isinstance(value, pa.Scalar):
        value = pa.scalar(value, type=data.type)
    elif data.type != value.type:
        value = pa.scalar(value.as_py(), type=data.type)
    options = IndexOptions(value=value)
    result = call_function('index', [data], options, memory_pool)
    if start is not None and result.as_py() >= 0:
        result = pa.scalar(result.as_py() + start, type=pa.int64())
    return result
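# A hedged usage sketch of index() above, assuming it is exposed as
# pyarrow.compute.index (the signature matches, and test_index below
# exercises it). Note that when `start` is given, the hit index is
# shifted back into the coordinates of the unsliced input.
import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([1, 2, 1, 3])
assert pc.index(arr, 1).as_py() == 0            # first occurrence
assert pc.index(arr, 1, start=1).as_py() == 2   # offset added back
assert pc.index(arr, 9).as_py() == -1           # not found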
def test_ext_scalar_from_storage():
    ty = UuidType()

    s = pa.ExtensionScalar.from_storage(ty, None)
    assert isinstance(s, pa.ExtensionScalar)
    assert s.type == ty
    assert s.is_valid is False
    assert s.value is None

    s = pa.ExtensionScalar.from_storage(ty, b"0123456789abcdef")
    assert isinstance(s, pa.ExtensionScalar)
    assert s.type == ty
    assert s.is_valid is True
    assert s.value == pa.scalar(b"0123456789abcdef", ty.storage_type)

    s = pa.ExtensionScalar.from_storage(ty, pa.scalar(None, ty.storage_type))
    assert isinstance(s, pa.ExtensionScalar)
    assert s.type == ty
    assert s.is_valid is False
    assert s.value is None

    s = pa.ExtensionScalar.from_storage(
        ty, pa.scalar(b"0123456789abcdef", ty.storage_type))
    assert isinstance(s, pa.ExtensionScalar)
    assert s.type == ty
    assert s.is_valid is True
    assert s.value == pa.scalar(b"0123456789abcdef", ty.storage_type)
def test_compare_scalar(typ):
    if typ == "array":
        def con(values):
            return pa.array(values)
    else:
        def con(values):
            return pa.chunked_array([values])

    arr = con([1, 2, 3, None])
    scalar = pa.scalar(2)

    result = pc.equal(arr, scalar)
    assert result.equals(con([False, True, False, None]))

    if typ == "array":
        nascalar = pa.scalar(None, type="int64")
        result = pc.equal(arr, nascalar)
        assert result.to_pylist() == [None, None, None, None]

    result = pc.not_equal(arr, scalar)
    assert result.equals(con([True, False, True, None]))

    result = pc.less(arr, scalar)
    assert result.equals(con([True, False, False, None]))

    result = pc.less_equal(arr, scalar)
    assert result.equals(con([True, True, False, None]))

    result = pc.greater(arr, scalar)
    assert result.equals(con([False, False, True, None]))

    result = pc.greater_equal(arr, scalar)
    assert result.equals(con([False, True, True, None]))
def test_basics(value, ty, klass, deprecated):
    s = pa.scalar(value, type=ty)
    assert isinstance(s, klass)
    assert s.as_py() == value
    assert s == pa.scalar(value, type=ty)
    assert s != value
    assert s != "else"
    assert hash(s) == hash(s)
    assert s.is_valid is True
    assert s != None  # noqa: E711

    with pytest.warns(FutureWarning):
        assert isinstance(s, deprecated)

    s = pa.scalar(None, type=s.type)
    assert s.is_valid is False
    assert s.as_py() is None
    assert s != pa.scalar(value, type=ty)

    # test pickle roundtrip
    restored = pickle.loads(pickle.dumps(s))
    assert s.equals(restored)

    # test that scalars are weak-referenceable
    wr = weakref.ref(s)
    assert wr() is not None
    del s
    assert wr() is None
def test_compare_string_scalar(typ):
    if typ == "array":
        def con(values):
            return pa.array(values)
    else:
        def con(values):
            return pa.chunked_array([values])

    arr = con(['a', 'b', 'c', None])
    scalar = pa.scalar('b')

    result = pc.equal(arr, scalar)
    assert result.equals(con([False, True, False, None]))

    if typ == "array":
        nascalar = pa.scalar(None, type="string")
        result = pc.equal(arr, nascalar)
        isnull = pc.is_null(result)
        assert isnull.equals(con([True, True, True, True]))

    result = pc.not_equal(arr, scalar)
    assert result.equals(con([True, False, True, None]))

    result = pc.less(arr, scalar)
    assert result.equals(con([True, False, False, None]))

    result = pc.less_equal(arr, scalar)
    assert result.equals(con([True, True, False, None]))

    result = pc.greater(arr, scalar)
    assert result.equals(con([False, False, True, None]))

    result = pc.greater_equal(arr, scalar)
    assert result.equals(con([False, True, True, None]))
def get_data(self, selector: SeriesSelector, start_date: datetime,
             end_date: datetime) -> pa.Table:
    """Read data in one of the predefined formats.

    The complete file will be loaded in an Arrow table during processing.
    """
    data = self.__read_all_data(selector)
    # pylint: disable=no-member
    on_or_after = pyarrow.compute.greater_equal(data["ts"],
                                                pa.scalar(start_date))
    before = pyarrow.compute.less(data["ts"], pa.scalar(end_date))
    return data.filter(pyarrow.compute.and_(on_or_after, before))
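# A minimal standalone sketch of the half-open [start_date, end_date)
# filter used in get_data() above; the table and column names here are
# made up for illustration.
import datetime
import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({
    "ts": [datetime.datetime(2021, 1, d) for d in (1, 2, 3)],
    "value": [1.0, 2.0, 3.0],
})
start = pa.scalar(datetime.datetime(2021, 1, 2))
end = pa.scalar(datetime.datetime(2021, 1, 3))
mask = pc.and_(pc.greater_equal(table["ts"], start),
               pc.less(table["ts"], end))
assert table.filter(mask)["value"].to_pylist() == [2.0]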
def test_index():
    arr = pa.array([0, 1, None, 3, 4], type=pa.int64())
    assert pc.index(arr, pa.scalar(0)).as_py() == 0
    assert pc.index(arr, pa.scalar(2, type=pa.int8())).as_py() == -1
    assert pc.index(arr, 4).as_py() == 4
    assert arr.index(3, start=2).as_py() == 3
    assert arr.index(None).as_py() == -1

    arr = pa.chunked_array([[1, 2], [1, 3]], type=pa.int64())
    assert arr.index(1).as_py() == 0
    assert arr.index(1, start=2).as_py() == 2
    assert arr.index(1, start=1, end=2).as_py() == -1
def compute_functions_example():
    import pyarrow.compute

    a = pa.array([1, 1, 2, 3])
    print("pa.compute.sum(a) = {}.".format(pa.compute.sum(a)))

    b = pa.array([4, 1, 2, 8])
    print("pa.compute.equal(a, b) = {}.".format(pa.compute.equal(a, b)))

    x, y = pa.scalar(7.8), pa.scalar(9.3)
    print("pa.compute.multiply(x, y) = {}.".format(pa.compute.multiply(x, y)))
def check_cython_example_module(mod):
    arr = pa.array([1, 2, 3])
    assert mod.get_array_length(arr) == 3
    with pytest.raises(TypeError, match="not an array"):
        mod.get_array_length(None)

    scal = pa.scalar(123)
    cast_scal = mod.cast_scalar(scal, pa.utf8())
    assert cast_scal == pa.scalar("123")
    with pytest.raises(NotImplementedError,
                       match="casting scalars of type int64 to type list"):
        mod.cast_scalar(scal, pa.list_(pa.int64()))
def test_nulls():
    null = pa.scalar(None)
    assert null is pa.NA
    assert null.as_py() is None
    assert null != "something"
    assert (null == pa.scalar(None)) is True
    assert (null == 0) is False
    assert pa.NA == pa.NA
    assert pa.NA not in [5]

    arr = pa.array([None, None])
    for v in arr:
        assert v is pa.NA
        assert v.as_py() is None
def test_bool():
    false = pa.scalar(False)
    true = pa.scalar(True)

    assert isinstance(false, pa.BooleanScalar)
    assert isinstance(true, pa.BooleanScalar)
    assert repr(true) == "<pyarrow.BooleanScalar: True>"
    assert str(true) == "True"
    assert repr(false) == "<pyarrow.BooleanScalar: False>"
    assert str(false) == "False"
    assert true.as_py() is True
    assert false.as_py() is False
def test_date_cast():
    # ARROW-10472 - casting of scalars doesn't segfault
    scalar = pa.scalar(datetime.datetime(2012, 1, 1),
                       type=pa.timestamp("us"))
    expected = datetime.date(2012, 1, 1)
    for ty in [pa.date32(), pa.date64()]:
        result = scalar.cast(ty)
        assert result.as_py() == expected
def _preprocess_host_value(self, value, dtype):
    valid = not _is_null_host_scalar(value)

    if isinstance(dtype, Decimal64Dtype):
        value = pa.scalar(
            value, type=pa.decimal128(dtype.precision, dtype.scale)
        ).as_py()
    if isinstance(value, decimal.Decimal) and dtype is None:
        dtype = Decimal64Dtype._from_decimal(value)

    value = to_cudf_compatible_scalar(value, dtype=dtype)

    if dtype is None:
        if not valid:
            if isinstance(value, (np.datetime64, np.timedelta64)):
                unit, _ = np.datetime_data(value)
                if unit == "generic":
                    raise TypeError(
                        "Can't convert generic NaT to null scalar")
                else:
                    dtype = value.dtype
            else:
                raise TypeError(
                    "dtype required when constructing a null scalar")
        else:
            dtype = value.dtype

    if not isinstance(dtype, Decimal64Dtype):
        dtype = np.dtype(dtype)

    if not valid:
        value = NA

    return value, dtype
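# A standalone sketch of the decimal normalization trick used above:
# pa.scalar() validates a host value against a decimal128 type with the
# requested precision/scale, and as_py() hands back a decimal.Decimal
# rescaled to match. Values and types here are illustrative.
import decimal
import pyarrow as pa

v = pa.scalar(decimal.Decimal("1.5"), type=pa.decimal128(5, 2)).as_py()
assert v == decimal.Decimal("1.50")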
def __ge__(self, other):
    if isinstance(other, self.__class__):
        f = lambda t: pa.compute.greater_equal(self.get(t), other.get(t))
        return self.breed('>=', other.key, f, required=other.required,
                          boolean=True)
    else:
        f = lambda t: pa.compute.greater_equal(self.get(t), pa.scalar(other))
        return self.breed('>=', str(other), f, boolean=True)
def __truediv__(self, other):
    if isinstance(other, self.__class__):
        f = lambda t: pa.compute.divide(self.get(t), other.get(t))
        return self.breed('/', other.key, f, required=other.required)
    else:
        f = lambda t: pa.compute.divide(self.get(t), pa.scalar(other))
        return self.breed('/', str(other), f)
def _parse_timestamp(value: str) -> pa.Scalar:
    with warnings.catch_warnings():
        # numpy warns with DeprecationWarning when converting timezone
        # offsets to UTC, even though converting timezone offsets to UTC
        # is obviously what everybody wants.
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        return pa.scalar(np.datetime64(value, "ns"), pa.timestamp("ns"))
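# A quick usage sketch of the conversion inside _parse_timestamp() above;
# the input string is an example, and np.datetime64 does the ISO-8601
# parsing before pa.scalar wraps the result.
import numpy as np
import pyarrow as pa

s = pa.scalar(np.datetime64("2021-03-01T12:00:00", "ns"), pa.timestamp("ns"))
assert s.type == pa.timestamp("ns")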
def coalesce(self, other):
    if isinstance(other, self.__class__):
        f = lambda t: pa.array(
            np.where(np.isnan(self.get(t)), other.get(t), self.get(t)))
        return ColumnNode(key='coalesce({}, {})'.format(self.key, other.key),
                          required=self.required + other.required,
                          func=f, depth=self.depth + 1)
    else:
        f = lambda t: pa.compute.fill_null(self.get(t), pa.scalar(other))
        return ColumnNode(key='coalesce({}, {})'.format(self.key, other),
                          required=self.required, func=f,
                          depth=self.depth + 1)
def test_fill_null():
    arr = pa.array([1, 2, None, 4], type=pa.int8())
    fill_value = pa.array([5], type=pa.int8())
    with pytest.raises(pa.ArrowInvalid, match="tried to convert to int"):
        arr.fill_null(fill_value)

    arr = pa.array([None, None, None, None], type=pa.null())
    fill_value = pa.scalar(None, type=pa.null())
    result = arr.fill_null(fill_value)
    expected = pa.array([None, None, None, None])
    assert result.equals(expected)

    arr = pa.array(['a', 'bb', None])
    result = arr.fill_null('ccc')
    expected = pa.array(['a', 'bb', 'ccc'])
    assert result.equals(expected)

    arr = pa.array([b'a', b'bb', None], type=pa.large_binary())
    result = arr.fill_null('ccc')
    expected = pa.array([b'a', b'bb', b'ccc'], type=pa.large_binary())
    assert result.equals(expected)

    arr = pa.array(['a', 'bb', None])
    result = arr.fill_null(None)
    expected = pa.array(['a', 'bb', None])
    assert result.equals(expected)
def test_basics(value, ty, klass, deprecated):
    s = pa.scalar(value, type=ty)
    assert isinstance(s, klass)
    assert s.as_py() == value
    assert s == pa.scalar(value, type=ty)
    assert s != value
    assert s != "else"
    assert hash(s) == hash(s)
    assert s.is_valid is True

    with pytest.warns(FutureWarning):
        assert isinstance(s, deprecated)

    s = pa.scalar(None, type=s.type)
    assert s.is_valid is False
    assert s.as_py() is None
    assert s != pa.scalar(value, type=ty)
def __mul__(self, other):
    if isinstance(other, self.__class__):
        f = lambda t: pa.compute.multiply(self.get(t), other.get(t))
        return self.breed('*', other.key, f, required=other.required)
    else:
        f = lambda t: pa.compute.multiply(self.get(t), pa.scalar(other))
        return self.breed('*', str(other), f)
def _cmp_method(self, other, op):
    from pandas.arrays import BooleanArray

    pc_func = ARROW_CMP_FUNCS[op.__name__]
    if isinstance(other, ArrowExtensionArray):
        result = pc_func(self._data, other._data)
    elif isinstance(other, (np.ndarray, list)):
        result = pc_func(self._data, other)
    elif is_scalar(other):
        try:
            result = pc_func(self._data, pa.scalar(other))
        except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid):
            mask = isna(self) | isna(other)
            valid = ~mask
            result = np.zeros(len(self), dtype="bool")
            result[valid] = op(np.array(self)[valid], other)
            return BooleanArray(result, mask)
    else:
        raise NotImplementedError(
            f"{op.__name__} not implemented for {type(other)}")

    if pa_version_under2p0:
        result = result.to_pandas().values
    else:
        result = result.to_numpy()
    return BooleanArray._from_sequence(result)
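# A minimal standalone sketch of the try/except fallback in _cmp_method
# above: use the pyarrow comparison kernel when it supports the operand
# types, and drop back to an elementwise numpy comparison when it does
# not. The helper name is illustrative, not the pandas implementation.
import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

def eq_with_fallback(data: pa.Array, other):
    try:
        return pc.equal(data, pa.scalar(other))
    except (pa.ArrowNotImplementedError, pa.ArrowInvalid):
        return pa.array(np.array(data) == other)

assert eq_with_fallback(pa.array(["a", "b"]), "a").to_pylist() == [True, False]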
def isin(self, values):
    if pa_version_under2p0:
        return super().isin(values)

    value_set = [
        pa_scalar.as_py()
        for pa_scalar in [pa.scalar(value, from_pandas=True)
                          for value in values]
        if pa_scalar.type in (pa.string(), pa.null())
    ]

    # for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0
    # returns True for null values, so we short-circuit to return an
    # all-False array.
    if not len(value_set):
        return np.zeros(len(self), dtype=bool)

    kwargs = {}
    if pa_version_under3p0:
        # in pyarrow 2.0.0 skip_null is ignored but is a required keyword
        # and raises with unexpected keyword argument in pyarrow 3.0.0+
        kwargs["skip_null"] = True

    result = pc.is_in(self._data, value_set=pa.array(value_set), **kwargs)
    # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to
    # convert nulls to False
    return np.array(result, dtype=np.bool_)
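# A standalone sketch of the pc.is_in call above, on a pyarrow recent
# enough that no skip_null compatibility shim is needed.
import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array(["a", "b", "c"])
result = pc.is_in(arr, value_set=pa.array(["a", "c"]))
assert result.to_pylist() == [True, False, True]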
def test_ext_scalar_from_array():
    data = [b"0123456789abcdef", b"0123456789abcdef",
            b"zyxwvutsrqponmlk", None]
    storage = pa.array(data, type=pa.binary(16))
    ty1 = UuidType()
    ty2 = ParamExtType(16)

    a = pa.ExtensionArray.from_storage(ty1, storage)
    b = pa.ExtensionArray.from_storage(ty2, storage)

    scalars_a = list(a)
    assert len(scalars_a) == 4

    for s, val in zip(scalars_a, data):
        assert isinstance(s, pa.ExtensionScalar)
        assert s.is_valid == (val is not None)
        assert s.type == ty1
        if val is not None:
            assert s.value == pa.scalar(val, storage.type)
        else:
            assert s.value is None
        assert s.as_py() == val

    scalars_b = list(b)
    assert len(scalars_b) == 4

    for sa, sb in zip(scalars_a, scalars_b):
        assert sa.is_valid == sb.is_valid
        assert sa.as_py() == sb.as_py()
        assert sa != sb
def fetch(self, verbose):
    t = self.parent.get(verbose)
    for c in self.nan_columns:
        arr = pa.compute.fill_null(
            t.column(c).combine_chunks(), pa.scalar(self.value))
        t = t.drop([c])
        t = t.append_column(c, arr)
    return t
def test_decimal256():
    v = decimal.Decimal("1234567890123456789012345678901234567890.123")
    s = pa.scalar(v)
    assert isinstance(s, pa.Decimal256Scalar)
    assert s.as_py() == v
    assert s.type == pa.decimal256(43, 3)

    v = decimal.Decimal("1.1234")
    with pytest.raises(pa.ArrowInvalid):
        pa.scalar(v, type=pa.decimal256(4, scale=3))

    # TODO: Add the following after implementing Decimal256 scaling.
    # with pytest.raises(pa.ArrowInvalid):
    #     pa.scalar(v, type=pa.decimal256(5, scale=3))

    s = pa.scalar(v, type=pa.decimal256(5, scale=4))
    assert isinstance(s, pa.Decimal256Scalar)
    assert s.as_py() == v
def _preprocess_host_value(self, value, dtype):
    valid = not cudf._lib.scalar._is_null_host_scalar(value)

    if isinstance(value, list):
        if dtype is not None:
            raise TypeError("Lists may not be cast to a different dtype")
        else:
            dtype = ListDtype.from_arrow(
                pa.infer_type([value], from_pandas=True))
            return value, dtype
    elif isinstance(dtype, ListDtype):
        if value not in {None, NA}:
            raise ValueError(f"Cannot coerce {value} to ListDtype")
        else:
            return NA, dtype

    if isinstance(value, dict):
        if dtype is None:
            dtype = StructDtype.from_arrow(
                pa.infer_type([value], from_pandas=True))
        return value, dtype
    elif isinstance(dtype, StructDtype):
        if value not in {None, NA}:
            raise ValueError(f"Cannot coerce {value} to StructDtype")
        else:
            return NA, dtype

    if isinstance(dtype, Decimal64Dtype):
        value = pa.scalar(
            value, type=pa.decimal128(dtype.precision, dtype.scale)
        ).as_py()
    if isinstance(value, decimal.Decimal) and dtype is None:
        dtype = Decimal64Dtype._from_decimal(value)

    value = to_cudf_compatible_scalar(value, dtype=dtype)

    if dtype is None:
        if not valid:
            if isinstance(value, (np.datetime64, np.timedelta64)):
                unit, _ = np.datetime_data(value)
                if unit == "generic":
                    raise TypeError(
                        "Can't convert generic NaT to null scalar")
                else:
                    dtype = value.dtype
            else:
                raise TypeError(
                    "dtype required when constructing a null scalar")
        else:
            dtype = value.dtype

    if not isinstance(dtype, Decimal64Dtype):
        dtype = cudf.dtype(dtype)

    if not valid:
        value = NA

    return value, dtype
def test_time():
    t1 = datetime.time(18, 0)
    t2 = datetime.time(21, 0)

    types = [pa.time32('s'), pa.time32('ms'),
             pa.time64('us'), pa.time64('ns')]
    for ty in types:
        for t in [t1, t2]:
            s = pa.scalar(t, type=ty)
            assert s.as_py() == t