def test_compare_scalar(typ):
    """Every comparison kernel against an int scalar, for array and chunked_array."""
    if typ == "array":
        con = pa.array
    else:
        def con(values):
            return pa.chunked_array([values])

    arr = con([1, 2, 3, None])
    scalar = pa.scalar(2)

    # (kernel, expected result vs. scalar 2); nulls propagate through
    cases = [
        (pc.equal, [False, True, False, None]),
        (pc.not_equal, [True, False, True, None]),
        (pc.less, [True, False, False, None]),
        (pc.less_equal, [True, True, False, None]),
        (pc.greater, [False, False, True, None]),
        (pc.greater_equal, [False, True, True, None]),
    ]
    for kernel, expected in cases:
        assert kernel(arr, scalar).equals(con(expected))

    # comparing against a null scalar yields all-null output
    if typ == "array":
        nascalar = pa.scalar(None, type="int64")
        assert pc.equal(arr, nascalar).to_pylist() == [None, None, None, None]
def test_compare_string_scalar(typ):
    """Every comparison kernel against a string scalar, for array and chunked_array."""
    if typ == "array":
        con = pa.array
    else:
        def con(values):
            return pa.chunked_array([values])

    arr = con(['a', 'b', 'c', None])
    scalar = pa.scalar('b')

    # (kernel, expected result vs. scalar 'b'); nulls propagate through
    for kernel, expected in [
        (pc.equal, [False, True, False, None]),
        (pc.not_equal, [True, False, True, None]),
        (pc.less, [True, False, False, None]),
        (pc.less_equal, [True, True, False, None]),
        (pc.greater, [False, False, True, None]),
        (pc.greater_equal, [False, True, True, None]),
    ]:
        assert kernel(arr, scalar).equals(con(expected))

    # comparing against a null string scalar makes every output slot null
    if typ == "array":
        nascalar = pa.scalar(None, type="string")
        isnull = pc.is_null(pc.equal(arr, nascalar))
        assert isnull.equals(con([True, True, True, True]))
def clean_hot(arr, categories=None, drop_first=False):
    """One-hot encode *arr* as a list of boolean pyarrow arrays.

    Parameters
    ----------
    arr : pyarrow array-like
        Values to encode; cast to string before comparison.
    categories : list, optional
        Category values to encode against.  When None or empty, the
        distinct truthy (non-null, non-empty) values of *arr* are used.
        NOTE: the original signature used a mutable default (``[]``),
        which is a classic Python pitfall; ``None`` is the safe sentinel
        and preserves the old behavior (empty list was falsy too).
    drop_first : bool, default False
        Drop the first indicator column (e.g. to avoid collinearity).

    Returns
    -------
    tuple[list, list]
        (indicator arrays, categories used), each with the first entry
        removed when *drop_first* is true.
    """
    arr = arr.cast(pa.string())
    if not categories:
        # Derive categories from the data, skipping null/empty values.
        categories = [u for u in arr.unique().to_pylist() if u]
    # Nulls compare as null; fill with False so they match no category.
    clns = [c.equal(arr, v).fill_null(False) for v in categories]
    start = 1 if drop_first else 0
    return clns[start:], categories[start:]
def test_compare_array(typ):
    """Elementwise comparison kernels between two arrays, with null propagation."""
    if typ == "array":
        con = pa.array
    else:
        def con(values):
            return pa.chunked_array([values])

    arr1 = con([1, 2, 3, 4, None])
    arr2 = con([1, 1, 4, None, 4])

    # (kernel, expected elementwise result); a null on either side -> null
    cases = [
        (pc.equal, [True, False, False, None, None]),
        (pc.not_equal, [False, True, True, None, None]),
        (pc.less, [False, False, True, None, None]),
        (pc.less_equal, [True, False, True, None, None]),
        (pc.greater, [False, True, False, None, None]),
        (pc.greater_equal, [True, True, False, None, None]),
    ]
    for kernel, expected in cases:
        assert kernel(arr1, arr2).equals(con(expected))
def test_compare_scalar(typ):
    """Comparison kernels against a scalar built via sum() (pre-pa.scalar variant)."""
    if typ == "array":
        def con(values):
            return pa.array(values)
    else:
        def con(values):
            return pa.chunked_array([values])

    arr = con([1, 2, 3, None])
    # TODO this is a hacky way to construct a scalar ..
    scalar = pa.array([2]).sum()

    checks = [
        (pc.equal, [False, True, False, None]),
        (pc.not_equal, [True, False, True, None]),
        (pc.less, [True, False, False, None]),
        (pc.less_equal, [True, True, False, None]),
        (pc.greater, [False, False, True, None]),
        (pc.greater_equal, [False, True, True, None]),
    ]
    for kernel, expected in checks:
        assert kernel(arr, scalar).equals(con(expected))
def pyarrow_transform(batch: pa.Table) -> pa.Table:
    """Filter to Versicolor rows, append a max-normalized sepal length, drop the raw column."""
    versicolor_mask = pac.equal(batch["variety"], "Versicolor")
    filtered = batch.filter(versicolor_mask)
    sepal_length = filtered["sepal.length"]
    # normalize by the column max so values land in [0, 1]
    normalized = pac.divide(sepal_length, pac.max(sepal_length))
    with_norm = filtered.append_column("normalized.sepal.length", normalized)
    return with_norm.drop(["sepal.length"])
def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArrayT:
    """
    Returns the mode(s) of the ExtensionArray.

    Always returns `ExtensionArray` even if only one value.

    Parameters
    ----------
    dropna : bool, default True
        Don't consider counts of NA values. Not implemented by pyarrow.

    Returns
    -------
    same type as self
        Sorted, if possible.
    """
    # pc.mode with an `n` argument requires pyarrow >= 6.0
    if pa_version_under6p0:
        raise NotImplementedError(
            "mode only supported for pyarrow version >= 6.0")
    # Ask for one mode candidate per distinct value so every tie for the
    # maximum count is included in the result.
    modes = pc.mode(self._data, pc.count_distinct(self._data).as_py())
    values = modes.field(0)
    counts = modes.field(1)
    # counts sorted descending i.e counts[0] = max
    mask = pc.equal(counts, counts[0])
    most_common = values.filter(mask)
    return type(self)(most_common)
def test_input_type_conversion():
    """Plain Python lists and scalars are coerced to Arrow inputs automatically."""
    # list + list: elementwise add, null propagates
    assert pc.add([1, 2], [4, None]).to_pylist() == [5, None]

    # list + Python int: int is broadcast as a scalar
    assert pc.add([1, 2], 4).to_pylist() == [5, 6]

    # Other scalar type: string comparison with null propagation
    assert pc.equal(["foo", "bar", None], "foo").to_pylist() == [True, False, None]
def binary_col(op, l, r):
    """Interpreter for executing binary operator expressions.

    Parameters
    ----------
    op : str
        Operator token (arithmetic: "+", "-", "*", "/"; comparison:
        "=", "==", "<>", "!=", "<", "<=", ">", ">="; logical:
        "and", "or"; membership: "in").
    l, r
        Operands, passed straight through to the pyarrow compute kernel.

    Returns
    -------
    The result of the corresponding pyarrow compute kernel.

    Raises
    ------
    Exception
        If *op* is not a supported operator.
    """
    # A dispatch table replaces the original long if-chain: one lookup,
    # and the supported operators are visible at a glance.  The *_checked
    # arithmetic kernels raise on overflow instead of wrapping.
    ops = {
        "+": compute.add_checked,
        "-": compute.subtract_checked,
        "*": compute.multiply_checked,
        "/": compute.divide_checked,
        "=": compute.equal,
        "==": compute.equal,
        "<>": compute.not_equal,
        "!=": compute.not_equal,
        "<": compute.less,
        "<=": compute.less_equal,
        ">": compute.greater,
        ">=": compute.greater_equal,
        "and": compute.and_,
        "or": compute.or_,
        "in": compute.is_in,
    }
    func = ops.get(op)
    if func is None:
        raise Exception("binary op not implemented")
    return func(l, r)
def _get_row_mask_per_channel(ch_column: pa.array, channels: List[int]) -> List[int]:
    """Generates the mask arrays for each channel.

    This function creates a list, the length of which is the expected number
    of channels, and in each entry of that list lies a boolean array that
    says whether each row belongs to that channel.
    """
    # Resolve requested channels against the reference column first,
    # then build one equality mask per resolved channel.
    matched_channels = _match_channels_to_reference(ch_column, channels)
    return [pc.equal(channel, ch_column) for channel in matched_channels]
def test_compare_chunked_array_mixed():
    """equal() accepts mixed array/chunked_array operands and mismatched chunking."""
    plain = pa.array([1, 2, 3, 4, None])
    chunked = pa.chunked_array([[1, 2, 3], [4, None]])
    chunked_alt = pa.chunked_array([[1, 2], [3, 4, None]])
    expected = pa.chunked_array([[True, True, True, True, None]])

    pairings = [
        (plain, chunked),
        (chunked, plain),
        (chunked, chunked_alt),
    ]
    for lhs, rhs in pairings:
        assert pc.equal(lhs, rhs).equals(expected)
def get_vectors_for_date_df(
    self,
    date: datetime.datetime,
    vector_names: Sequence[str],
    realizations: Optional[Sequence[int]] = None,
) -> pd.DataFrame:
    """Read the named vectors for a single date into a pandas DataFrame.

    Parameters
    ----------
    date : datetime.datetime
        The date to select rows for (matched exactly against the DATE
        column at millisecond resolution).
    vector_names : Sequence[str]
        Vector columns to read, in addition to DATE and REAL.
    realizations : Sequence[int], optional
        When given (and non-empty), restrict rows to these REAL values;
        otherwise all realizations are returned.

    Returns
    -------
    pd.DataFrame
        One row per matching (date, realization); the DATE column is
        dropped before conversion.  Read/filter/convert timings are
        logged at debug level.
    """
    timer = PerfTimer()

    columns_to_get = ["DATE", "REAL"]
    columns_to_get.extend(vector_names)
    table = self._get_or_read_table(columns_to_get)
    et_read_ms = timer.lap_ms()

    # Note that we use MS here to be aligned with storage type in arrow file
    lookup_date = pa.scalar(date, type=pa.timestamp("ms"))
    mask = pc.equal(table["DATE"], lookup_date)

    if realizations:
        real_mask = pc.is_in(table["REAL"], value_set=pa.array(realizations))
        mask = pc.and_(mask, real_mask)

    # DATE is constant after the filter, so drop it before materializing
    table = table.drop(["DATE"])

    # table = table.filter(mask).combine_chunks()
    table = table.filter(mask)
    et_filter_ms = timer.lap_ms()

    df = table.to_pandas()
    # df = table.to_pandas(split_blocks=True, zero_copy_only=True)
    # del table  # not necessary, but a good practice
    et_to_pandas_ms = timer.lap_ms()

    LOGGER.debug(
        f"get_vectors_for_date_df() took: {timer.elapsed_ms()}ms ("
        f"read={et_read_ms}ms, "
        f"filter={et_filter_ms}ms, "
        f"to_pandas={et_to_pandas_ms}ms), "
        f"#vecs={len(vector_names)}, "
        f"#real={len(realizations) if realizations else 'all'}, "
        f"df.shape={df.shape}, file={Path(self._arrow_file_name).name}")

    return df
# Exploratory script: CSV ingestion with type overrides, memory checks, and
# simple compute-kernel experiments on the taxi dataset.
# NOTE(review): `table` and `csv`/`pa` must already be in scope from earlier
# in the file — this chunk does not define them.
table_df = table.to_pandas()

# Parse VendorID as boolean: "Y"/"1" -> True, "N"/"2" -> False.
convert_options = csv.ConvertOptions(
    column_types={
        "VendorID": pa.bool_(),
        # "trip_distance": pa.float16()
    },
    true_values=["Y", "1"],
    false_values=["N", "2"])
table = csv.read_csv("../sec1-intro/yellow_tripdata_2020-01.csv.gz",
                     convert_options=convert_options)

# Compare column sizes (MiB for the string flag, KiB for the others).
print(table["store_and_fwd_flag"].unique(),
      table["store_and_fwd_flag"].nbytes // (1024**2),
      table["VendorID"].nbytes // 1024,
      table["store_and_fwd_flag"].nbytes // 1024)

# Round-trip bool -> string -> bool cast experiment.
x = pa.array([False, True]).cast(pa.string()).cast(pa.bool_())

table_df = table.to_pandas()
print(table_df.store_and_fwd_flag)

# self_destruct=True frees Arrow memory as it is converted; `table` should
# not be used for data access after this call.
mission_impossible = table.to_pandas(self_destruct=True)

import pyarrow.compute as pc

# Scalar comparison experiments (int vs. float literal against the column).
pc.equal(table["total_amount"], 0)
pc.equal(table["total_amount"], 0.0)

# Mean tip ratio over rows with a non-zero total.
t0 = table.filter(pc.not_equal(table["total_amount"], 0.0))
pc.mean(pc.divide(t0["tip_amount"], t0["total_amount"]))  # 18ms

# The fair comparison is (also do on other computer)