def test_tree_exp_builder():
    """Check a Gandiva if/else projection built with ``TreeExprBuilder``.

    The expression is ``a if a > b else b`` over two int32 columns; the
    projected output is compared with a hand-written expected array.
    """
    import pyarrow.gandiva as gandiva

    # Input schema and the field describing the projected output column.
    col_a = pa.field('a', pa.int32())
    col_b = pa.field('b', pa.int32())
    input_schema = pa.schema([col_a, col_b])
    output_field = pa.field('res', pa.int32())

    # Expression tree: if (a > b) then a else b.
    tree = gandiva.TreeExprBuilder()
    lhs = tree.make_field(col_a)
    rhs = tree.make_field(col_b)
    a_gt_b = tree.make_function("greater_than", [lhs, rhs], pa.bool_())
    pick_node = tree.make_if(a_gt_b, lhs, rhs, pa.int32())
    expression = tree.make_expression(pick_node, output_field)

    projector = gandiva.make_projector(
        input_schema, [expression], pa.default_memory_pool())

    values_a = pa.array([10, 12, -20, 5], type=pa.int32())
    values_b = pa.array([5, 15, 15, 17], type=pa.int32())
    expected = pa.array([10, 15, 15, 17], type=pa.int32())
    batch = pa.RecordBatch.from_arrays([values_a, values_b],
                                       names=['a', 'b'])

    # evaluate() yields one output array per expression; exactly one here.
    (result,) = projector.evaluate(batch)
    assert result.equals(expected)
def test_regex():
    """Check the Gandiva ``like`` function on a string column.

    Each element of column ``a`` is matched against the pattern
    ``"%spark%"``; the boolean output is compared with a hand-written
    expected array.
    """
    import pyarrow.gandiva as gandiva

    elements = ["park", "sparkle", "bright spark and fire", "spark"]
    data = pa.array(elements, type=pa.string())
    table = pa.Table.from_arrays([data], names=['a'])

    builder = gandiva.TreeExprBuilder()
    # Schema.field() replaces the deprecated Schema.field_by_name() and
    # matches how the other tests in this file look up fields.
    node_a = builder.make_field(table.schema.field("a"))
    # The literal is the pattern argument handed to "like".
    pattern = builder.make_literal("%spark%", pa.string())
    like = builder.make_function("like", [node_a, pattern], pa.bool_())

    field_result = pa.field("b", pa.bool_())
    expr = builder.make_expression(like, field_result)

    projector = gandiva.make_projector(
        table.schema, [expr], pa.default_memory_pool())

    # evaluate() yields one output array per expression; exactly one here.
    r, = projector.evaluate(table.to_batches()[0])
    expected = pa.array([False, True, True, True], type=pa.bool_())
    assert r.equals(expected)
def test_filter_project():
    """Check a Gandiva filter feeding its selection vector to a projector.

    The filter keeps rows where ``a > b``. The projector evaluates
    ``b if b < c else c`` and is given the filter's selection vector, so
    the output is compared against an expected array of three values.
    """
    import pyarrow.gandiva as gandiva

    mpool = pa.default_memory_pool()

    # Create a table with some sample data
    array0 = pa.array([10, 12, -20, 5, 21, 29], pa.int32())
    array1 = pa.array([5, 15, 15, 17, 12, 3], pa.int32())
    array2 = pa.array([1, 25, 11, 30, -21, None], pa.int32())
    table = pa.Table.from_arrays([array0, array1, array2], ['a', 'b', 'c'])

    field_result = pa.field("res", pa.int32())

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    node_b = builder.make_field(table.schema.field("b"))
    node_c = builder.make_field(table.schema.field("c"))

    greater_than_function = builder.make_function(
        "greater_than", [node_a, node_b], pa.bool_())
    filter_condition = builder.make_condition(greater_than_function)

    project_condition = builder.make_function(
        "less_than", [node_b, node_c], pa.bool_())
    if_node = builder.make_if(project_condition, node_b, node_c, pa.int32())
    expr = builder.make_expression(if_node, field_result)

    # Build a filter for the expressions. Named row_filter so the builtin
    # filter() is not shadowed.
    row_filter = gandiva.make_filter(table.schema, filter_condition)

    # Build a projector for the expressions. "UINT32" is passed as the
    # selection mode because the projector is evaluated with a selection
    # vector below.
    projector = gandiva.make_projector(
        table.schema, [expr], mpool, "UINT32")

    # Convert the table once; filter and projector read the same batch.
    batch = table.to_batches()[0]

    # Evaluate filter
    selection_vector = row_filter.evaluate(batch, mpool)

    # Evaluate project
    r, = projector.evaluate(batch, selection_vector)

    exp = pa.array([1, -21, None], pa.int32())
    assert r.equals(exp)
def test_tree_exp_builder():
    """Check expression building, introspection and projection in Gandiva.

    Builds ``a if a > b else b`` over two int32 columns, asserts the
    types reported by the field node and the expression, checks the
    projector's LLVM IR text for an ``@expr_`` symbol, and compares the
    projected output with a hand-written expected array.
    """
    import pyarrow.gandiva as gandiva

    # Input schema and the field describing the projected output column.
    col_a = pa.field('a', pa.int32())
    col_b = pa.field('b', pa.int32())
    input_schema = pa.schema([col_a, col_b])
    output_field = pa.field('res', pa.int32())

    tree = gandiva.TreeExprBuilder()
    lhs = tree.make_field(col_a)
    rhs = tree.make_field(col_b)
    assert lhs.return_type() == col_a.type

    # Expression tree: if (a > b) then a else b.
    a_gt_b = tree.make_function("greater_than", [lhs, rhs], pa.bool_())
    pick_node = tree.make_if(a_gt_b, lhs, rhs, pa.int32())
    expression = tree.make_expression(pick_node, output_field)
    assert expression.result().type == pa.int32()

    projector = gandiva.make_projector(
        input_schema, [expression], pa.default_memory_pool())

    # Gandiva generates compute kernel function named `@expr_X`
    assert "@expr_" in projector.llvm_ir

    values_a = pa.array([10, 12, -20, 5], type=pa.int32())
    values_b = pa.array([5, 15, 15, 17], type=pa.int32())
    expected = pa.array([10, 15, 15, 17], type=pa.int32())
    batch = pa.RecordBatch.from_arrays([values_a, values_b],
                                       names=['a', 'b'])

    # evaluate() yields one output array per expression; exactly one here.
    (result,) = projector.evaluate(batch)
    assert result.equals(expected)
def test_table():
    """Check the Gandiva ``add`` function on a table built from pandas.

    Columns ``a`` and ``b`` of a two-row DataFrame are added by a
    projector, and the output is compared with the same sum computed by
    pandas.
    """
    import pyarrow.gandiva as gandiva

    df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
    table = pa.Table.from_pandas(df)

    builder = gandiva.TreeExprBuilder()
    # Schema.field() replaces the deprecated Schema.field_by_name().
    node_a = builder.make_field(table.schema.field("a"))
    node_b = builder.make_field(table.schema.field("b"))

    # Named add_node so the builtin sum() is not shadowed.
    add_node = builder.make_function("add", [node_a, node_b], pa.float64())

    field_result = pa.field("c", pa.float64())
    expr = builder.make_expression(add_node, field_result)

    projector = gandiva.make_projector(
        table.schema, [expr], pa.default_memory_pool())

    # TODO: Add .evaluate function which can take Tables instead of
    # RecordBatches
    r, = projector.evaluate(table.to_batches()[0])

    e = pa.Array.from_pandas(df["a"] + df["b"])
    assert r.equals(e)
def test_table():
    """Check the Gandiva ``add`` function on a two-column float table.

    Columns ``a`` and ``b`` are added by a projector and the output is
    compared with the hand-written expected array ``[4.0, 6.0]``.
    """
    import pyarrow.gandiva as gandiva

    columns = [pa.array([1.0, 2.0]), pa.array([3.0, 4.0])]
    table = pa.Table.from_arrays(columns, ['a', 'b'])
    schema = table.schema

    tree = gandiva.TreeExprBuilder()
    operands = [tree.make_field(schema.field("a")),
                tree.make_field(schema.field("b"))]
    add_node = tree.make_function("add", operands, pa.float64())

    output_field = pa.field("c", pa.float64())
    expression = tree.make_expression(add_node, output_field)

    projector = gandiva.make_projector(
        schema, [expression], pa.default_memory_pool())

    # TODO: Add .evaluate function which can take Tables instead of
    # RecordBatches
    first_batch = table.to_batches()[0]
    (result,) = projector.evaluate(first_batch)

    expected = pa.array([4.0, 6.0])
    assert result.equals(expected)