Example #1
0
def test_tree_exp_builder():
    """Check an ``if a > b then a else b`` projection over int32 columns.

    The expression is assembled with ``TreeExprBuilder``, compiled into a
    projector for the two-column schema, and evaluated on a four-row
    record batch whose expected output is listed inline.
    """
    import pyarrow.gandiva as gandiva

    # Schema: two int32 inputs and one int32 output field.
    lhs_field = pa.field('a', pa.int32())
    rhs_field = pa.field('b', pa.int32())
    out_field = pa.field('res', pa.int32())
    input_schema = pa.schema([lhs_field, rhs_field])

    # Expression tree: if (a > b) then a else b.
    tree = gandiva.TreeExprBuilder()
    lhs = tree.make_field(lhs_field)
    rhs = tree.make_field(rhs_field)
    is_greater = tree.make_function("greater_than", [lhs, rhs], pa.bool_())
    choose = tree.make_if(is_greater, lhs, rhs, pa.int32())
    expression = tree.make_expression(choose, out_field)

    projector = gandiva.make_projector(
        input_schema, [expression], pa.default_memory_pool())

    columns = [
        pa.array([10, 12, -20, 5], type=pa.int32()),
        pa.array([5, 15, 15, 17], type=pa.int32()),
    ]
    batch = pa.RecordBatch.from_arrays(columns, names=['a', 'b'])
    expected = pa.array([10, 15, 15, 17], type=pa.int32())

    # The projector returns one array per expression; exactly one here.
    (actual,) = projector.evaluate(batch)
    assert actual.equals(expected)
Example #2
0
def test_regex():
    """Evaluate Gandiva's ``like`` function over a string column.

    The pattern ``"%spark%"`` is applied to column ``a``; the expected
    boolean output marks exactly the elements that contain ``"spark"``.
    """
    import pyarrow.gandiva as gandiva

    elements = ["park", "sparkle", "bright spark and fire", "spark"]
    data = pa.array(elements, type=pa.string())
    table = pa.Table.from_arrays([data], names=['a'])

    builder = gandiva.TreeExprBuilder()
    # Schema.field() is the supported name-based lookup;
    # Schema.field_by_name() is deprecated.
    node_a = builder.make_field(table.schema.field("a"))
    # This literal is a LIKE pattern, not a regular expression.
    pattern = builder.make_literal("%spark%", pa.string())
    like = builder.make_function("like", [node_a, pattern], pa.bool_())

    field_result = pa.field("b", pa.bool_())
    expr = builder.make_expression(like, field_result)

    projector = gandiva.make_projector(table.schema, [expr],
                                       pa.default_memory_pool())

    # The projector is evaluated on the table's first record batch.
    (result,) = projector.evaluate(table.to_batches()[0])
    expected = pa.array([False, True, True, True], type=pa.bool_())
    assert result.equals(expected)
def test_filter_project():
    """Run a Gandiva filter and feed its selection vector to a projector.

    The filter condition is ``a > b``. The projected expression is
    ``if b < c then b else c``, evaluated only on the rows picked by the
    filter. One selected row has a null ``c``; the expected output is
    null for that row.
    """
    import pyarrow.gandiva as gandiva
    mpool = pa.default_memory_pool()
    # Create a table with some sample data
    array0 = pa.array([10, 12, -20, 5, 21, 29], pa.int32())
    array1 = pa.array([5, 15, 15, 17, 12, 3], pa.int32())
    array2 = pa.array([1, 25, 11, 30, -21, None], pa.int32())

    table = pa.Table.from_arrays([array0, array1, array2], ['a', 'b', 'c'])
    # Filter and projector run over the same batch; fetch it once.
    batch = table.to_batches()[0]

    field_result = pa.field("res", pa.int32())

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    node_b = builder.make_field(table.schema.field("b"))
    node_c = builder.make_field(table.schema.field("c"))

    greater_than_function = builder.make_function("greater_than",
                                                  [node_a, node_b], pa.bool_())
    filter_condition = builder.make_condition(greater_than_function)

    project_condition = builder.make_function("less_than", [node_b, node_c],
                                              pa.bool_())
    if_node = builder.make_if(project_condition, node_b, node_c, pa.int32())
    expr = builder.make_expression(if_node, field_result)

    # Build a filter for the expressions.
    # Named row_filter so the builtin filter() is not shadowed.
    row_filter = gandiva.make_filter(table.schema, filter_condition)

    # Build a projector for the expressions.
    # NOTE(review): "UINT32" is presumably the selection-vector mode
    # matching the filter output -- confirm against pyarrow.gandiva.
    projector = gandiva.make_projector(table.schema, [expr], mpool, "UINT32")

    # Evaluate filter
    selection_vector = row_filter.evaluate(batch, mpool)

    # Evaluate project
    (result,) = projector.evaluate(batch, selection_vector)

    expected = pa.array([1, -21, None], pa.int32())
    assert result.equals(expected)
Example #4
0
def test_tree_exp_builder():
    """Check an ``if a > b then a else b`` projection and its metadata.

    Besides the projected values, this verifies the return type reported
    by a field node, the result type of the built expression, and that
    the projector's LLVM IR mentions an ``@expr_`` kernel.
    """
    import pyarrow.gandiva as gandiva

    # Schema: two int32 inputs and one int32 output field.
    lhs_field = pa.field('a', pa.int32())
    rhs_field = pa.field('b', pa.int32())
    out_field = pa.field('res', pa.int32())
    input_schema = pa.schema([lhs_field, rhs_field])

    tree = gandiva.TreeExprBuilder()
    lhs = tree.make_field(lhs_field)
    rhs = tree.make_field(rhs_field)

    assert lhs.return_type() == lhs_field.type

    # Expression tree: if (a > b) then a else b.
    is_greater = tree.make_function("greater_than", [lhs, rhs], pa.bool_())
    choose = tree.make_if(is_greater, lhs, rhs, pa.int32())
    expression = tree.make_expression(choose, out_field)

    assert expression.result().type == pa.int32()

    projector = gandiva.make_projector(
        input_schema, [expression], pa.default_memory_pool())

    # Gandiva generates compute kernel function named `@expr_X`
    assert "@expr_" in projector.llvm_ir

    columns = [
        pa.array([10, 12, -20, 5], type=pa.int32()),
        pa.array([5, 15, 15, 17], type=pa.int32()),
    ]
    batch = pa.RecordBatch.from_arrays(columns, names=['a', 'b'])
    expected = pa.array([10, 15, 15, 17], type=pa.int32())

    # The projector returns one array per expression; exactly one here.
    (actual,) = projector.evaluate(batch)
    assert actual.equals(expected)
Example #5
0
def test_table():
    """Project ``a + b`` over a table built from a pandas DataFrame.

    The Gandiva ``add`` result on the table's first record batch is
    compared with the same sum computed by pandas.
    """
    import pyarrow.gandiva as gandiva

    df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
    table = pa.Table.from_pandas(df)

    builder = gandiva.TreeExprBuilder()
    # Schema.field() is the supported name-based lookup;
    # Schema.field_by_name() is deprecated.
    node_a = builder.make_field(table.schema.field("a"))
    node_b = builder.make_field(table.schema.field("b"))

    # Named add_node so the builtin sum() is not shadowed.
    add_node = builder.make_function("add", [node_a, node_b], pa.float64())

    field_result = pa.field("c", pa.float64())
    expr = builder.make_expression(add_node, field_result)

    projector = gandiva.make_projector(table.schema, [expr],
                                       pa.default_memory_pool())

    # TODO: Add .evaluate function which can take Tables instead of
    # RecordBatches
    (result,) = projector.evaluate(table.to_batches()[0])

    expected = pa.Array.from_pandas(df["a"] + df["b"])
    assert result.equals(expected)
Example #6
0
def test_table():
    """Project ``a + b`` over a two-column float table.

    The Gandiva ``add`` result on the table's first record batch is
    compared with the literal expected array ``[4.0, 6.0]``.
    """
    import pyarrow.gandiva as gandiva

    table = pa.Table.from_arrays(
        [pa.array([1.0, 2.0]), pa.array([3.0, 4.0])], ['a', 'b'])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    node_b = builder.make_field(table.schema.field("b"))

    # Named add_node so the builtin sum() is not shadowed.
    add_node = builder.make_function("add", [node_a, node_b], pa.float64())

    field_result = pa.field("c", pa.float64())
    expr = builder.make_expression(add_node, field_result)

    projector = gandiva.make_projector(table.schema, [expr],
                                       pa.default_memory_pool())

    # TODO: Add .evaluate function which can take Tables instead of
    # RecordBatches
    (result,) = projector.evaluate(table.to_batches()[0])

    expected = pa.array([4.0, 6.0])
    assert result.equals(expected)