def mode(array, n=1, *, skip_nulls=True, min_count=0):
    """
    Compute the top-n most frequent values of a numerical (chunked) array.

    Every entry of the result pairs a value with the number of times it
    occurs. Entries are ordered by descending number of occurrences; when
    several values share the same count, the smaller value comes first.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    n : int, default 1
        How many of the most common values to return.
    skip_nulls : bool, default True
        If True, nulls in the input are ignored. Otherwise an empty array
        is returned if any input is null.
    min_count : int, default 0
        An empty array is returned if the input holds fewer than this
        many values.

    Returns
    -------
    An array of <input type "Mode", int64_t "Count"> structs

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2])
    >>> modes = pc.mode(arr, 2)
    >>> modes[0]
    <pyarrow.StructScalar: {'mode': 2, 'count': 5}>
    >>> modes[1]
    <pyarrow.StructScalar: {'mode': 1, 'count': 2}>
    """
    mode_options = ModeOptions(
        n,
        skip_nulls=skip_nulls,
        min_count=min_count,
    )
    return call_function("mode", [array], mode_options)
def index(data, value, start=None, end=None, *, memory_pool=None):
    """
    Locate the first occurrence of a value.

    Parameters
    ----------
    data : Array or ChunkedArray
    value : Scalar-like object
        The value to look for. A non-Scalar, or a Scalar whose type
        differs from the data's type, is converted to a Scalar of the
        data's type first.
    start : int, optional
        If given, the search is restricted to the slice of the data
        beginning at this position.
    end : int, optional
        If given, the search is restricted to the slice of the data
        ending before this position.
    memory_pool : MemoryPool, optional
        Forwarded to the underlying compute function call.

    Returns
    -------
    index : the index, or -1 if not found
    """
    # Restrict the search window first. When `start` is used, it is added
    # back to a successful result further down.
    if start is None:
        if end is not None:
            data = data.slice(0, end)
    elif end is None:
        data = data.slice(start)
    else:
        data = data.slice(start, end - start)

    # Make sure the searched value is a Scalar of the data's type.
    if isinstance(value, pa.Scalar):
        if data.type != value.type:
            value = pa.scalar(value.as_py(), type=data.type)
    else:
        value = pa.scalar(value, type=data.type)

    index_options = IndexOptions(value=value)
    found = call_function('index', [data], index_options, memory_pool)
    if start is None:
        return found

    position = found.as_py()
    if position >= 0:
        # Translate the slice-relative position back to the input.
        return pa.scalar(position + start, type=pa.int64())
    return found
def mode(array, n=1):
    """
    Compute the top-n most frequent values of a numerical (chunked) array.

    Every entry of the result pairs a value with the number of times it
    occurs, in descending order of occurrence. When more than one value
    has the same count, the smaller one is returned first.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    n : int, default 1
        Passed to ``ModeOptions`` as ``n``; the number of top values
        requested.

    Returns
    -------
    An array of <input type "Mode", int64_t "Count"> structs

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2])
    >>> modes = pc.mode(arr, 2)
    >>> modes[0]
    <pyarrow.StructScalar: {'mode': 2, 'count': 5}>
    >>> modes[1]
    <pyarrow.StructScalar: {'mode': 1, 'count': 2}>
    """
    return call_function("mode", [array], ModeOptions(n=n))
def take(data, indices):
    """
    Pick values (or records) out of array- or table-like data by index.

    The result has the same type(s) as the input; its elements are those
    of the input array (or record batch / table fields) found at the
    given indices. A null index yields a null in the corresponding output
    slot.

    Parameters
    ----------
    data : Array, ChunkedArray, RecordBatch, or Table
    indices : Array, ChunkedArray
        Must be of integer type

    Returns
    -------
    result : depends on inputs

    Examples
    --------
    >>> import pyarrow as pa
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> indices = pa.array([0, None, 4, 3])
    >>> arr.take(indices)
    <pyarrow.lib.StringArray object at 0x7ffa4fc7d368>
    [
      "a",
      null,
      "e",
      null
    ]
    """
    take_args = [data, indices]
    return call_function('take', take_args)
def cast(arr, target_type, safe=True):
    """
    Convert the values of an array to another data type.

    This is also available as an array instance method.

    Parameters
    ----------
    arr : Array or ChunkedArray
    target_type : DataType or type string alias
        Type to cast to
    safe : bool, default True
        Check for overflows or other unsafe conversions

    Returns
    -------
    casted : Array

    Raises
    ------
    ValueError
        If ``target_type`` is None.

    Examples
    --------
    >>> from datetime import datetime
    >>> import pyarrow as pa
    >>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])
    >>> arr.type
    TimestampType(timestamp[us])

    You can use ``pyarrow.DataType`` objects to specify the target type:

    >>> cast(arr, pa.timestamp('ms'))
    <pyarrow.lib.TimestampArray object at 0x7fe93c0f6910>
    [
      2010-01-01 00:00:00.000,
      2015-01-01 00:00:00.000
    ]

    >>> cast(arr, pa.timestamp('ms')).type
    TimestampType(timestamp[ms])

    Alternatively, it is also supported to use the string aliases for these
    types:

    >>> arr.cast('timestamp[ms]')
    <pyarrow.lib.TimestampArray object at 0x10420eb88>
    [
      1262304000000,
      1420070400000
    ]
    >>> arr.cast('timestamp[ms]').type
    TimestampType(timestamp[ms])
    """
    if target_type is None:
        raise ValueError("Cast target type must not be None")

    # Choose the CastOptions factory that matches the requested safety.
    make_options = _pc.CastOptions.safe if safe else _pc.CastOptions.unsafe
    return call_function("cast", [arr], make_options(target_type))
def sum(array):
    """
    Add up the values of a numerical (chunked) array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray

    Returns
    -------
    sum : pyarrow.Scalar
    """
    total = call_function('sum', [array])
    return total
def binary_contains_exact(array, pattern):
    """
    Check whether *pattern* occurs inside each value of a binary array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        pattern to search for exact matches

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    contains_options = _pc.BinaryContainsExactOptions(pattern)
    return call_function("binary_contains_exact", [array], contains_options)
def match_substring_regex(array, pattern):
    """
    Check whether regex *pattern* matches anywhere in each value of a
    string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        regex pattern to search

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    regex_options = MatchSubstringOptions(pattern)
    return call_function("match_substring_regex", [array], regex_options)
def match_substring(array, pattern):
    """
    Check whether substring *pattern* occurs inside each value of a string
    array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        pattern to search for exact matches

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    substring_options = MatchSubstringOptions(pattern)
    return call_function("match_substring", [array], substring_options)
def count_substring_regex(array, pattern, *, ignore_case=False):
    """
    Count the non-overlapping matches of regex *pattern* in each value
    of a string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        regex pattern to search for
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    # `ignore_case` is passed by keyword, as count_substring does, so the
    # call does not rely on the positional order of the options class.
    options = MatchSubstringOptions(pattern, ignore_case=ignore_case)
    return call_function("count_substring_regex", [array], options)
def find_substring(array, pattern):
    """
    Locate the first occurrence of substring *pattern* in each value of a
    string array and report its index.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        pattern to search for exact matches

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    substring_options = MatchSubstringOptions(pattern)
    return call_function("find_substring", [array], substring_options)
def count_substring(array, pattern):
    """
    Count how often substring *pattern* occurs in each value of a string
    array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        pattern to search for exact matches

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    substring_options = MatchSubstringOptions(pattern)
    return call_function("count_substring", [array], substring_options)
def match_substring(array, pattern, *, ignore_case=False):
    """
    Test if substring *pattern* is contained within a value of a string
    array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        pattern to search for exact matches
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    # `ignore_case` is passed by keyword, as count_substring does, so the
    # call does not rely on the positional order of the options class.
    options = MatchSubstringOptions(pattern, ignore_case=ignore_case)
    return call_function("match_substring", [array], options)
def match_substring_regex(array, pattern, *, ignore_case=False):
    """
    Test if regex *pattern* matches at any position a value of a string
    array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        regex pattern to search
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    # `ignore_case` is passed by keyword, as count_substring does, so the
    # call does not rely on the positional order of the options class.
    options = MatchSubstringOptions(pattern, ignore_case=ignore_case)
    return call_function("match_substring_regex", [array], options)
def bottom_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
    """
    Select the indices of the bottom-k ordered elements from
    array- or table-like data.

    This is a specialization for :func:`select_k_unstable`. Output is not
    guaranteed to be stable.

    Parameters
    ----------
    values : Array, ChunkedArray, RecordBatch, or Table
        Data to sort and get bottom indices from.
    k : int
        The number of `k` elements to keep.
    sort_keys : List-like
        Column key names to order by when input is table-like data.
        The passed object is only read, never modified.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    result : Array of indices

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> pc.bottom_k_unstable(arr, k=3)
    <pyarrow.lib.UInt64Array object at 0x7fdcb19d7fa0>
    [
      0,
      1,
      2
    ]
    """
    requested_keys = [] if sort_keys is None else sort_keys
    if isinstance(values, (pa.Array, pa.ChunkedArray)):
        # Array input: add the ("dummy", "ascending") placeholder key.
        # A new list is built rather than appending in place, so the
        # caller's `sort_keys` is left untouched and any iterable works.
        keys = [*requested_keys, ("dummy", "ascending")]
    else:
        # Table-like input: every requested column is sorted ascending.
        # A concrete list (not a one-shot iterator) is handed over.
        keys = [(key_name, "ascending") for key_name in requested_keys]
    options = SelectKOptions(k, keys)
    return call_function("select_k_unstable", [values], options, memory_pool)
def find_substring_regex(array, pattern, *, ignore_case=False):
    """
    Find the index of the first match of regex *pattern* in each
    value of a string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        regex pattern to search for
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    # `ignore_case` is passed by keyword, as count_substring does, so the
    # call does not rely on the positional order of the options class.
    options = MatchSubstringOptions(pattern, ignore_case=ignore_case)
    return call_function("find_substring_regex", [array], options)
def filter(data, mask, null_selection_behavior='drop'):
    """
    Keep the values (or records) of array- or table-like data for which a
    boolean mask is true.

    Parameters
    ----------
    data : Array, ChunkedArray, RecordBatch, or Table
    mask : Array, ChunkedArray
        Must be of boolean type
    null_selection_behavior : str, default 'drop'
        How a null slot in the mask is handled.
        Allowed values are 'drop' and 'emit_null'.

        - 'drop': nulls will be treated as equivalent to False.
        - 'emit_null': nulls will result in a null in the output.

    Returns
    -------
    result : depends on inputs

    Examples
    --------
    >>> import pyarrow as pa
    >>> arr = pa.array(["a", "b", "c", None, "e"])
    >>> mask = pa.array([True, False, None, False, True])
    >>> arr.filter(mask)
    <pyarrow.lib.StringArray object at 0x7fa826df9200>
    [
      "a",
      "e"
    ]
    >>> arr.filter(mask, null_selection_behavior='emit_null')
    <pyarrow.lib.StringArray object at 0x7fa826df9200>
    [
      "a",
      null,
      "e"
    ]
    """
    filter_options = FilterOptions(null_selection_behavior)
    filter_args = [data, mask]
    return call_function('filter', filter_args, filter_options)
def count_substring(array, pattern, *, ignore_case=False):
    """
    Count how often substring *pattern* occurs in each value of a string
    array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        pattern to search for exact matches
    ignore_case : bool, default False
        Ignore case while searching.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    substring_options = MatchSubstringOptions(
        pattern, ignore_case=ignore_case
    )
    return call_function("count_substring", [array], substring_options)
def match_like(array, pattern):
    """
    Check whether the SQL-style LIKE pattern *pattern* matches each value
    of a string array.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        SQL-style LIKE pattern. '%' will match any number of
        characters, '_' will match exactly one character, and all
        other characters match themselves. To match a literal percent
        sign or underscore, precede the character with a backslash.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    like_options = MatchSubstringOptions(pattern)
    return call_function("match_like", [array], like_options)
def take(data, indices, *, boundscheck=True, memory_pool=None):
    """
    Pick values (or records) out of array- or table-like data by index.

    The result has the same type(s) as the input; its elements are those
    of the input array (or record batch / table fields) found at the
    given indices. A null index yields a null in the corresponding output
    slot.

    Parameters
    ----------
    data : Array, ChunkedArray, RecordBatch, or Table
    indices : Array, ChunkedArray
        Must be of integer type
    boundscheck : boolean, default True
        Whether to boundscheck the indices. If False and there is an out of
        bounds index, will likely cause the process to crash.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    result : depends on inputs

    Examples
    --------
    >>> import pyarrow as pa
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> indices = pa.array([0, None, 4, 3])
    >>> arr.take(indices)
    <pyarrow.lib.StringArray object at 0x7ffa4fc7d368>
    [
      "a",
      null,
      "e",
      null
    ]
    """
    take_options = TakeOptions(boundscheck=boundscheck)
    take_args = [data, indices]
    return call_function('take', take_args, take_options, memory_pool)
def fill_null(values, fill_value):
    """
    Replace each null element in values with fill_value.

    The fill_value must be the same type as values or able to be
    implicitly casted to the array's type.

    This is an alias for :func:`coalesce`.

    Parameters
    ----------
    values : Array, ChunkedArray, or Scalar-like object
        Each null element is replaced with the corresponding value
        from fill_value.
    fill_value : Array, ChunkedArray, or Scalar-like object
        If not same type as data will attempt to cast.

    Returns
    -------
    result : depends on inputs

    Examples
    --------
    >>> import pyarrow as pa
    >>> arr = pa.array([1, 2, None, 3], type=pa.int8())
    >>> fill_value = pa.scalar(5, type=pa.int8())
    >>> arr.fill_null(fill_value)
    <pyarrow.lib.Int8Array object at 0x7f95437f01a0>
    [
      1,
      2,
      5,
      3
    ]
    """
    if not isinstance(fill_value, (pa.Array, pa.ChunkedArray, pa.Scalar)):
        # Plain Python object: wrap it as a Scalar of the target type.
        fill_value = pa.scalar(fill_value, type=values.type)
    elif values.type != fill_value.type:
        if isinstance(fill_value, pa.Scalar):
            fill_value = pa.scalar(fill_value.as_py(), type=values.type)
        else:
            # Array / ChunkedArray fill values are converted with cast();
            # the as_py() round-trip above is only valid for Scalars.
            fill_value = fill_value.cast(values.type)

    return call_function("coalesce", [values, fill_value])
def fill_null(values, fill_value):
    """
    Substitute fill_value for every null element of values.

    The fill_value must have the same type as values, or be implicitly
    castable to the array's type.

    Parameters
    ----------
    values : Array, ChunkedArray
        Data in which each null element is replaced with fill_value.
    fill_value : Scalar-like object
        Either a pyarrow.Scalar or any python object coercible to a
        Scalar. If its type differs from that of the data, a cast is
        attempted.

    Returns
    -------
    result : depends on inputs

    Examples
    --------
    >>> import pyarrow as pa
    >>> arr = pa.array([1, 2, None, 3], type=pa.int8())
    >>> fill_value = pa.scalar(5, type=pa.int8())
    >>> arr.fill_null(fill_value)
    <pyarrow.lib.Int8Array object at 0x7f95437f01a0>
    [
      1,
      2,
      5,
      3
    ]
    """
    target_type = values.type
    if isinstance(fill_value, pa.Scalar):
        # An existing Scalar is only rebuilt when its type differs.
        if target_type != fill_value.type:
            fill_value = pa.scalar(fill_value.as_py(), type=target_type)
    else:
        fill_value = pa.scalar(fill_value, type=target_type)

    return call_function("fill_null", [values, fill_value])
def random(n, *, initializer='system', options=None, memory_pool=None):
    """
    Generate numbers in the range [0, 1).

    Generated values are uniformly-distributed, double-precision
    in range [0, 1). Algorithm and seed can be changed via RandomOptions.

    Parameters
    ----------
    n : int
        Number of values to generate, must be greater than or equal to 0
    initializer : int or str
        How to initialize the underlying random generator.
        If an integer is given, it is used as a seed.
        If "system" is given, the random generator is initialized with
        a system-specific source of (hopefully true) randomness.
        Other values are invalid.
        Only used when `options` is not given.
    options : pyarrow.compute.RandomOptions, optional
        Alternative way of passing options. When given, it is used as is
        and `initializer` is not consulted.
    memory_pool : pyarrow.MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.
    """
    # Only build options from `initializer` when the caller did not
    # supply them; previously a passed `options` was silently discarded.
    if options is None:
        options = RandomOptions(initializer=initializer)
    return call_function("random", [], options, memory_pool, length=n)
def mode(array):
    """
    Compute the mode (most common value) of a numerical (chunked) array.

    When several values are equally common, only the smallest of them is
    returned.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray

    Returns
    -------
    mode : pyarrow.StructScalar

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2])
    >>> pc.mode(arr)
    <pyarrow.StructScalar: {'mode': 2, 'count': 5}>
    """
    most_common = call_function("mode", [array])
    return most_common
def index(data, value, start=None, end=None, *, memory_pool=None):
    """
    Locate the first occurrence of a value.

    Parameters
    ----------
    data : Array-like
    value : Scalar-like object
        The value to search for. A non-Scalar, or a Scalar whose type
        differs from the data's type, is converted to a Scalar of the
        data's type first.
    start : int, optional
        If given, the search is restricted to the slice of the data
        beginning at this position.
    end : int, optional
        If given, the search is restricted to the slice of the data
        ending before this position.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    index : int
        the index, or -1 if not found
    """
    has_start = start is not None
    has_end = end is not None

    # Slice the data down to the requested window, if any.
    if has_start and has_end:
        data = data.slice(start, end - start)
    elif has_start:
        data = data.slice(start)
    elif has_end:
        data = data.slice(0, end)

    # The searched value must be a Scalar of the data's type.
    needs_wrapping = not isinstance(value, pa.Scalar)
    if needs_wrapping:
        value = pa.scalar(value, type=data.type)
    elif data.type != value.type:
        value = pa.scalar(value.as_py(), type=data.type)

    search_options = IndexOptions(value=value)
    result = call_function('index', [data], search_options, memory_pool)

    if has_start:
        relative = result.as_py()
        if relative >= 0:
            # Shift the slice-relative hit back to a position in the input.
            result = pa.scalar(relative + start, type=pa.int64())
    return result
def func(arg):
    # Unary wrapper: forwards its single argument to the compute function
    # selected by `name`. `name` is not defined in this block; it is
    # resolved from the surrounding scope.
    unary_args = [arg]
    return call_function(name, unary_args)
def func(left, right):
    # Binary wrapper: forwards both operands, in order, to the compute
    # function selected by `name`. `name` is not defined in this block; it
    # is resolved from the surrounding scope.
    binary_args = [left, right]
    return call_function(name, binary_args)
def cast(arr, target_type=None, safe=None, options=None):
    """
    Convert the values of an array to another data type.

    This is also available as an array instance method. The conversion is
    described either by `target_type` (optionally with `safe`) or by a
    ready-made `options` object, but not by both.

    Parameters
    ----------
    arr : Array-like
    target_type : DataType or str
        Type to cast to
    safe : bool, default True
        Check for overflows or other unsafe conversions. Only an explicit
        False selects the unsafe conversion; None behaves like True.
    options : CastOptions, default None
        Additional checks pass by CastOptions

    Returns
    -------
    casted : Array

    Raises
    ------
    ValueError
        If `options` is passed together with `target_type` or `safe`.

    Examples
    --------
    >>> from datetime import datetime
    >>> import pyarrow as pa
    >>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])
    >>> arr.type
    TimestampType(timestamp[us])

    You can use ``pyarrow.DataType`` objects to specify the target type:

    >>> cast(arr, pa.timestamp('ms'))
    <pyarrow.lib.TimestampArray object at ...>
    [
      2010-01-01 00:00:00.000,
      2015-01-01 00:00:00.000
    ]

    >>> cast(arr, pa.timestamp('ms')).type
    TimestampType(timestamp[ms])

    Alternatively, it is also supported to use the string aliases for these
    types:

    >>> arr.cast('timestamp[ms]')
    <pyarrow.lib.TimestampArray object at ...>
    [
      2010-01-01 00:00:00.000,
      2015-01-01 00:00:00.000
    ]
    >>> arr.cast('timestamp[ms]').type
    TimestampType(timestamp[ms])
    """
    if options is not None:
        # Explicit options exclude the target_type / safe shorthand.
        if safe is not None or target_type is not None:
            raise ValueError("Must either pass values for 'target_type' and 'safe'"
                             " or pass a value for 'options'")
        return call_function("cast", [arr], options)

    target_type = pa.types.lib.ensure_type(target_type)
    build_options = CastOptions.unsafe if safe is False else CastOptions.safe
    return call_function("cast", [arr], build_options(target_type))