def test_compare_scalar(typ):
    """Check the comparison kernels for an integer container vs. a scalar.

    ``typ`` selects the container under test: ``"array"`` builds a plain
    array, any other value builds a chunked array holding one chunk.
    Every expected result is wrapped in the same container kind before
    being compared with the kernel output.
    """

    def con(values):
        # Build whichever container kind ``typ`` asked for.
        if typ == "array":
            return pa.array(values)
        return pa.chunked_array([values])

    arr = con([1, 2, 3, None])
    scalar = pa.scalar(2)

    def check(kernel, expected):
        # Run one comparison kernel against the scalar and verify it.
        assert kernel(arr, scalar).equals(con(expected))

    check(pc.equal, [False, True, False, None])

    if typ == "array":
        # Only exercised for plain arrays: comparing against a null scalar
        # is expected to give null in every slot.
        null_scalar = pa.scalar(None, type="int64")
        null_result = pc.equal(arr, null_scalar)
        assert null_result.to_pylist() == [None, None, None, None]

    check(pc.not_equal, [True, False, True, None])
    check(pc.less, [True, False, False, None])
    check(pc.less_equal, [True, True, False, None])
    check(pc.greater, [False, False, True, None])
    check(pc.greater_equal, [False, True, True, None])
def test_compare_array(typ):
    """Check the comparison kernels for two containers of the same kind.

    ``typ`` selects the container under test: ``"array"`` builds plain
    arrays, any other value builds chunked arrays holding one chunk.
    Both inputs contain a null at a different position, and the expected
    results carry a null wherever either input does.
    """

    def con(values):
        # Build whichever container kind ``typ`` asked for.
        if typ == "array":
            return pa.array(values)
        return pa.chunked_array([values])

    arr1 = con([1, 2, 3, 4, None])
    arr2 = con([1, 1, 4, None, 4])

    # (kernel, expected element-wise result), checked in this order.
    expectations = [
        (pc.equal, [True, False, False, None, None]),
        (pc.not_equal, [False, True, True, None, None]),
        (pc.less, [False, False, True, None, None]),
        (pc.less_equal, [True, False, True, None, None]),
        (pc.greater, [False, True, False, None, None]),
        (pc.greater_equal, [True, True, False, None, None]),
    ]
    for kernel, expected in expectations:
        assert kernel(arr1, arr2).equals(con(expected))
# NOTE(review): a function named ``test_compare_scalar`` is also defined
# earlier in this file. If both really live in one module, this later
# definition replaces the earlier one and only this body gets collected.
# The name is left untouched here; consider renaming or merging the two.
def test_compare_scalar(typ):
    """Check the comparison kernels for an integer container vs. a scalar.

    ``typ`` selects the container under test: ``"array"`` builds a plain
    array, any other value builds a chunked array holding one chunk.
    Every expected result is wrapped in the same container kind before
    being compared with the kernel output.
    """
    if typ == "array":
        def con(values):
            return pa.array(values)
    else:
        def con(values):
            return pa.chunked_array([values])

    arr = con([1, 2, 3, None])
    # Build the scalar directly with ``pa.scalar`` (as the other scalar
    # tests in this file do) rather than summing a one-element array.
    scalar = pa.scalar(2)

    result = pc.equal(arr, scalar)
    assert result.equals(con([False, True, False, None]))

    result = pc.not_equal(arr, scalar)
    assert result.equals(con([True, False, True, None]))

    result = pc.less(arr, scalar)
    assert result.equals(con([True, False, False, None]))

    result = pc.less_equal(arr, scalar)
    assert result.equals(con([True, True, False, None]))

    result = pc.greater(arr, scalar)
    assert result.equals(con([False, False, True, None]))

    result = pc.greater_equal(arr, scalar)
    assert result.equals(con([False, True, True, None]))
def test_compare_string_scalar(typ):
    """Check the comparison kernels for a string container vs. a scalar.

    ``typ`` selects the container under test: ``"array"`` builds a plain
    array, any other value builds a chunked array holding one chunk.
    Every expected result is wrapped in the same container kind before
    being compared with the kernel output.
    """
    if typ == "array":
        con = pa.array
    else:
        def con(values):
            return pa.chunked_array([values])

    arr = con(['a', 'b', 'c', None])
    scalar = pa.scalar('b')

    assert pc.equal(arr, scalar).equals(con([False, True, False, None]))

    if typ == "array":
        # Only exercised for plain arrays: comparing against a null string
        # scalar is expected to leave every slot null.
        null_scalar = pa.scalar(None, type="string")
        all_null = pc.is_null(pc.equal(arr, null_scalar))
        assert all_null.equals(con([True, True, True, True]))

    # Remaining kernels, checked in this order.
    remaining = (
        (pc.not_equal, [True, False, True, None]),
        (pc.less, [True, False, False, None]),
        (pc.less_equal, [True, True, False, None]),
        (pc.greater, [False, False, True, None]),
        (pc.greater_equal, [False, True, True, None]),
    )
    for kernel, expected in remaining:
        assert kernel(arr, scalar).equals(con(expected))
# Operator token -> name of the function looked up on ``compute``.
# Names (not function objects) are stored so that ``compute`` is only
# touched when a supported operator is actually dispatched.
_BINARY_OP_KERNELS = {
    "+": "add_checked",
    "-": "subtract_checked",
    "*": "multiply_checked",
    "/": "divide_checked",
    "=": "equal",
    "==": "equal",
    "<>": "not_equal",
    "!=": "not_equal",
    "<": "less",
    "<=": "less_equal",
    ">": "greater",
    ">=": "greater_equal",
    "and": "and_",
    "or": "or_",
    "in": "is_in",
}


def binary_col(op, l, r):
    """Interpreter for executing binary operator expressions.

    Args:
        op: Operator token. Supported tokens are ``+ - * /``, ``= ==``,
            ``<> !=``, ``< <= > >=``, ``and``, ``or`` and ``in``.
        l: Left operand, passed through unchanged as the first argument.
        r: Right operand, passed through unchanged as the second argument.

    Returns:
        Whatever the selected ``compute`` function returns for ``(l, r)``.

    Raises:
        NotImplementedError: If ``op`` is not a supported token. This is a
            subclass of ``Exception``, so callers that catch ``Exception``
            keep working; the message names the offending operator.
    """
    # Non-string tokens (including unhashable ones) can never match a key,
    # so route them straight to the "not implemented" error.
    kernel_name = _BINARY_OP_KERNELS.get(op) if isinstance(op, str) else None
    if kernel_name is None:
        raise NotImplementedError(f"binary op not implemented: {op!r}")
    return getattr(compute, kernel_name)(l, r)
# Notebook-style walkthrough: convert a table to pandas, re-read the taxi CSV
# with explicit boolean parsing, inspect column sizes, then run a few compute
# kernels on the result.
#
# NOTE(review): `table` is not assigned anywhere above this line in the
# visible part of the script; presumably it comes from earlier code - confirm.
table_df = table.to_pandas()

# Force VendorID to a boolean column; "Y"/"1" are listed as true values and
# "N"/"2" as false values.
convert_options = csv.ConvertOptions(
    column_types={
        "VendorID": pa.bool_(),
        # "trip_distance": pa.float16()
    },
    true_values=["Y", "1"],
    false_values=["N", "2"])
table = csv.read_csv("../sec1-intro/yellow_tripdata_2020-01.csv.gz",
                     convert_options=convert_options)

# Print, in order: the distinct values of store_and_fwd_flag, its size in
# MiB, the size of VendorID in KiB, and the size of store_and_fwd_flag in KiB.
print(table["store_and_fwd_flag"].unique(),
      table["store_and_fwd_flag"].nbytes // (1024**2),
      table["VendorID"].nbytes // 1024,
      table["store_and_fwd_flag"].nbytes // 1024)

# Round-trip a boolean array through string and back to boolean.
# `x` is not used again in the visible part of the script.
x = pa.array([False, True]).cast(pa.string()).cast(pa.bool_())

table_df = table.to_pandas()
print(table_df.store_and_fwd_flag)

# NOTE(review): pyarrow documents `self_destruct=True` as experimental and
# warns against using the source table afterwards, yet `table` is used
# again below - confirm this ordering is intended.
mission_impossible = table.to_pandas(self_destruct=True)

import pyarrow.compute as pc

# Compare total_amount against an int and a float literal; both results are
# discarded (only useful as interactive output).
pc.equal(table["total_amount"], 0)
pc.equal(table["total_amount"], 0.0)

# Keep only rows whose total_amount is not 0.0 so the division below never
# has a zero denominator, then take the mean tip / total ratio.
t0 = table.filter(pc.not_equal(table["total_amount"], 0.0))
pc.mean(pc.divide(t0["tip_amount"], t0["total_amount"]))  # 18ms
# The fair comparison is (also do on other computer)