Example #1
0
        def gandiva_query(table, query):
            """
            Filter `table` with a gandiva condition compiled from `query`.

            Parameters
            ----------
            table : pyarrow.Table
                Table whose columns the query refers to.
            query : str
                Query string to evaluate on the `table` columns.

            Returns
            -------
            pyarrow.Table

            Raises
            ------
            ValueError
                If `can_be_condition` rejects the expression generated from `query`.

            Notes
            -----
            Only the first record batch of `table` is handed to the filter.
            """
            parsed = gen_table_expr(table, query)
            if not can_be_condition(parsed):
                raise ValueError("Root operation should be a filter.")

            # The import is kept local because of
            # https://github.com/modin-project/modin/issues/3849; once that issue
            # is fixed it should move to the top of this file.
            import pyarrow.gandiva as gandiva

            tree_builder = gandiva.TreeExprBuilder()
            condition = tree_builder.make_condition(
                build_node(table, parsed.terms, tree_builder)
            )
            row_filter = gandiva.make_filter(table.schema, condition)

            first_batch = table.to_batches()[0]
            selection = row_filter.evaluate(first_batch, pa.default_memory_pool())
            return filter_with_selection_vector(table, selection)
Example #2
0
def test_in_expr():
    """Check IN-expression filters on string, int32 and int64 columns.

    Each case builds a one-column table, filters its first record batch with
    an IN condition and compares the resulting selection vector with the
    expected row positions.
    """
    import pyarrow.gandiva as gandiva

    # string
    arr = pa.array(["ga", "an", "nd", "di", "iv", "va"])
    table = pa.Table.from_arrays([arr], ["a"])

    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, ["an", "nd"], pa.string())
    condition = builder.make_condition(cond)
    # Named `filt` so the builtin `filter` is not shadowed.
    filt = gandiva.make_filter(table.schema, condition)
    result = filt.evaluate(table.to_batches()[0], pa.default_memory_pool())
    # Compare as Arrow arrays (like the other filter tests in this file)
    # instead of comparing a list of array elements with plain ints.
    assert result.to_array().equals(pa.array([1, 2], type=pa.uint32()))

    # int32
    arr = pa.array([3, 1, 4, 1, 5, 9, 2, 6, 5, 4])
    table = pa.Table.from_arrays([arr.cast(pa.int32())], ["a"])
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [1, 5], pa.int32())
    condition = builder.make_condition(cond)
    filt = gandiva.make_filter(table.schema, condition)
    result = filt.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert result.to_array().equals(pa.array([1, 3, 4, 8], type=pa.uint32()))

    # int64
    arr = pa.array([3, 1, 4, 1, 5, 9, 2, 6, 5, 4])
    table = pa.Table.from_arrays([arr], ["a"])
    node_a = builder.make_field(table.schema.field("a"))
    cond = builder.make_in_expression(node_a, [1, 5], pa.int64())
    condition = builder.make_condition(cond)
    filt = gandiva.make_filter(table.schema, condition)
    result = filt.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert result.to_array().equals(pa.array([1, 3, 4, 8], type=pa.uint32()))
Example #3
0
def test_tree_exp_builder():
    """Project ``a if a > b else b`` over two int32 columns with gandiva."""
    import pyarrow.gandiva as gandiva

    tree = gandiva.TreeExprBuilder()

    lhs_field = pa.field('a', pa.int32())
    rhs_field = pa.field('b', pa.int32())
    input_schema = pa.schema([lhs_field, rhs_field])
    output_field = pa.field('res', pa.int32())

    lhs = tree.make_field(lhs_field)
    rhs = tree.make_field(rhs_field)

    # res = a if a > b else b
    is_greater = tree.make_function("greater_than", [lhs, rhs], pa.bool_())
    pick_node = tree.make_if(is_greater, lhs, rhs, pa.int32())
    pick_expr = tree.make_expression(pick_node, output_field)

    projector = gandiva.make_projector(
        input_schema, [pick_expr], pa.default_memory_pool())

    batch = pa.RecordBatch.from_arrays(
        [
            pa.array([10, 12, -20, 5], type=pa.int32()),
            pa.array([5, 15, 15, 17], type=pa.int32()),
        ],
        names=['a', 'b'],
    )
    expected = pa.array([10, 15, 15, 17], type=pa.int32())

    (projected,) = projector.evaluate(batch)
    assert projected.equals(expected)
Example #4
0
        def gandiva_query(table, query):
            """
            Run a string query against `table` through a gandiva filter.

            Parameters
            ----------
            table : pyarrow.Table
                Table whose columns the query refers to.
            query : str
                Query string to evaluate on the `table` columns.

            Returns
            -------
            pyarrow.Table

            Raises
            ------
            ValueError
                If `can_be_condition` rejects the expression generated from `query`.

            Notes
            -----
            Only the first record batch of `table` is handed to the filter.
            """
            parsed = gen_table_expr(table, query)
            if not can_be_condition(parsed):
                raise ValueError("Root operation should be a filter.")

            tree_builder = gandiva.TreeExprBuilder()
            root_node = build_node(table, parsed.terms, tree_builder)
            row_filter = gandiva.make_filter(
                table.schema, tree_builder.make_condition(root_node)
            )

            # The filter yields a selection vector, which is then applied to
            # the table by the helper below.
            selection = row_filter.evaluate(
                table.to_batches()[0], pa.default_memory_pool()
            )
            return filter_with_selection_vector(table, selection)
Example #5
0
def test_boolean():
    """Check a filter combining AND / OR over float64 comparisons.

    The condition is ``(a < 50 and a > b) or b < 11``; the test compares the
    selection vector produced for the first record batch with the expected
    row positions.
    """
    import pyarrow.gandiva as gandiva

    df = pd.DataFrame({
        "a": [1., 31., 46., 3., 57., 44., 22.],
        "b": [5., 45., 36., 73., 83., 23., 76.]
    })
    table = pa.Table.from_pandas(df)

    builder = gandiva.TreeExprBuilder()
    # `Schema.field` replaces the deprecated `Schema.field_by_name`.
    node_a = builder.make_field(table.schema.field("a"))
    node_b = builder.make_field(table.schema.field("b"))
    fifty = builder.make_literal(50.0, pa.float64())
    eleven = builder.make_literal(11.0, pa.float64())

    cond_1 = builder.make_function("less_than", [node_a, fifty], pa.bool_())
    cond_2 = builder.make_function("greater_than", [node_a, node_b],
                                   pa.bool_())
    cond_3 = builder.make_function("less_than", [node_b, eleven], pa.bool_())
    cond = builder.make_or([builder.make_and([cond_1, cond_2]), cond_3])
    condition = builder.make_condition(cond)

    # Named `filt` so the builtin `filter` is not shadowed.
    filt = gandiva.make_filter(table.schema, condition)
    result = filt.evaluate(table.to_batches()[0], pa.default_memory_pool())
    # Compare as Arrow arrays instead of comparing a list of array elements
    # with plain ints.
    assert result.to_array().equals(pa.array([0, 2, 5], type=pa.uint32()))
 def gandiva_query2(table, query):
     """
     Build a gandiva filter for `query` against the schema of `table`.

     Raises ValueError when `can_be_condition` rejects the expression
     generated from `query`. The filter is returned without being evaluated.
     """
     parsed = gen_table_expr(table, query)
     if not can_be_condition(parsed):
         raise ValueError("Root operation should be a filter.")
     tree_builder = gandiva.TreeExprBuilder()
     condition = tree_builder.make_condition(
         build_node(table, parsed.terms, tree_builder)
     )
     return gandiva.make_filter(table.schema, condition)
Example #7
0
 def gandiva_query(table, query):
     """
     Evaluate `query` as a gandiva filter on the first batch of `table`.

     Returns whatever `filter_with_selection_vector` produces for `table` and
     the resulting selection vector. Raises ValueError when
     `can_be_condition` rejects the expression generated from `query`.
     """
     parsed = gen_table_expr(table, query)
     if not can_be_condition(parsed):
         raise ValueError("Root operation should be a filter.")
     tree_builder = gandiva.TreeExprBuilder()
     root_node = build_node(table, parsed.terms, tree_builder)
     row_filter = gandiva.make_filter(
         table.schema, tree_builder.make_condition(root_node)
     )
     first_batch = table.to_batches()[0]
     selection = row_filter.evaluate(first_batch, pa.default_memory_pool())
     return filter_with_selection_vector(table, selection)
Example #8
0
def test_filter():
    """Check that ``a < 1000`` selects the first 1000 rows of a float column."""
    import pyarrow.gandiva as gandiva

    df = pd.DataFrame({"a": [1.0 * i for i in range(10000)]})
    table = pa.Table.from_pandas(df)

    builder = gandiva.TreeExprBuilder()
    # `Schema.field` replaces the deprecated `Schema.field_by_name`.
    node_a = builder.make_field(table.schema.field("a"))
    thousand = builder.make_literal(1000.0, pa.float64())
    cond = builder.make_function("less_than", [node_a, thousand], pa.bool_())
    condition = builder.make_condition(cond)

    # Named `filt` so the builtin `filter` is not shadowed.
    filt = gandiva.make_filter(table.schema, condition)
    result = filt.evaluate(table.to_batches()[0], pa.default_memory_pool())
    assert result.to_array().equals(pa.array(range(1000), type=pa.uint32()))
Example #9
0
def test_to_string():
    """Check the string form of literal, field, function and AND nodes."""
    import pyarrow.gandiva as gandiva
    tree = gandiva.TreeExprBuilder()

    # Literals: only the prefix is checked for the double literal.
    double_literal = tree.make_literal(2.0, pa.float64())
    assert str(double_literal).startswith('(const double) 2 raw(')
    int_literal = tree.make_literal(2, pa.int64())
    assert str(int_literal) == '(const int64) 2'

    # Fields
    double_field = tree.make_field(pa.field('x', pa.float64()))
    assert str(double_field) == '(double) x'
    string_field = tree.make_field(pa.field('y', pa.string()))
    assert str(string_field) == '(string) y'

    # Function node
    bool_z = tree.make_field(pa.field('z', pa.bool_()))
    negated = tree.make_function('not', [bool_z], pa.bool_())
    assert str(negated) == 'bool not((bool) z)'

    # AND node
    bool_y = tree.make_field(pa.field('y', pa.bool_()))
    conjunction = tree.make_and([negated, bool_y])
    assert str(conjunction) == 'bool not((bool) z) && (bool) y'
def test_filter():
    """Check that ``a < 1000`` selects the first 1000 rows and emits IR."""
    import pyarrow.gandiva as gandiva

    values = pa.array([1.0 * i for i in range(10000)])
    table = pa.Table.from_arrays([values], ['a'])

    tree = gandiva.TreeExprBuilder()
    column = tree.make_field(table.schema.field("a"))
    limit = tree.make_literal(1000.0, pa.float64())
    below_limit = tree.make_function("less_than", [column, limit], pa.bool_())
    condition = tree.make_condition(below_limit)

    filter = gandiva.make_filter(table.schema, condition)
    # Gandiva generates compute kernel function named `@expr_X`
    assert filter.llvm_ir.find("@expr_") != -1

    first_batch = table.to_batches()[0]
    selection = filter.evaluate(first_batch, pa.default_memory_pool())
    expected = pa.array(range(1000), type=pa.uint32())
    assert selection.to_array().equals(expected)
Example #11
0
def test_regex():
    """Check the ``like`` function with the pattern ``%spark%`` on strings."""
    import pyarrow.gandiva as gandiva

    elements = ["park", "sparkle", "bright spark and fire", "spark"]
    data = pa.array(elements, type=pa.string())
    table = pa.Table.from_arrays([data], names=['a'])

    builder = gandiva.TreeExprBuilder()
    # `Schema.field` replaces the deprecated `Schema.field_by_name`.
    node_a = builder.make_field(table.schema.field("a"))
    regex = builder.make_literal("%spark%", pa.string())
    like = builder.make_function("like", [node_a, regex], pa.bool_())

    field_result = pa.field("b", pa.bool_())
    expr = builder.make_expression(like, field_result)

    projector = gandiva.make_projector(table.schema, [expr],
                                       pa.default_memory_pool())

    # The projector returns one output array per expression.
    r, = projector.evaluate(table.to_batches()[0])
    b = pa.array([False, True, True, True], type=pa.bool_())
    assert r.equals(b)
def test_filter_project():
    """Project ``b if b < c else c`` over only the rows where ``a > b``."""
    import pyarrow.gandiva as gandiva
    pool = pa.default_memory_pool()

    # Sample data: three int32 columns, the last one containing a null.
    col_a = pa.array([10, 12, -20, 5, 21, 29], pa.int32())
    col_b = pa.array([5, 15, 15, 17, 12, 3], pa.int32())
    col_c = pa.array([1, 25, 11, 30, -21, None], pa.int32())
    table = pa.Table.from_arrays([col_a, col_b, col_c], ['a', 'b', 'c'])

    output_field = pa.field("res", pa.int32())

    tree = gandiva.TreeExprBuilder()
    schema = table.schema
    node_a = tree.make_field(schema.field("a"))
    node_b = tree.make_field(schema.field("b"))
    node_c = tree.make_field(schema.field("c"))

    # Filter side: keep rows with a > b.
    a_gt_b = tree.make_function("greater_than", [node_a, node_b], pa.bool_())
    keep_rows = tree.make_condition(a_gt_b)

    # Projection side: res = b if b < c else c.
    b_lt_c = tree.make_function("less_than", [node_b, node_c], pa.bool_())
    pick_node = tree.make_if(b_lt_c, node_b, node_c, pa.int32())
    pick_expr = tree.make_expression(pick_node, output_field)

    filter = gandiva.make_filter(table.schema, keep_rows)
    projector = gandiva.make_projector(
        table.schema, [pick_expr], pool, "UINT32")

    # Filter first, then feed the selection vector into the projection.
    selection = filter.evaluate(table.to_batches()[0], pool)
    (projected,) = projector.evaluate(table.to_batches()[0], selection)

    expected = pa.array([1, -21, None], pa.int32())
    assert projected.equals(expected)
Example #13
0
def test_tree_exp_builder():
    """Project ``a if a > b else b`` and check node/expression metadata."""
    import pyarrow.gandiva as gandiva

    tree = gandiva.TreeExprBuilder()

    lhs_field = pa.field('a', pa.int32())
    rhs_field = pa.field('b', pa.int32())
    input_schema = pa.schema([lhs_field, rhs_field])
    output_field = pa.field('res', pa.int32())

    lhs = tree.make_field(lhs_field)
    rhs = tree.make_field(rhs_field)

    assert lhs.return_type() == lhs_field.type

    # res = a if a > b else b
    is_greater = tree.make_function("greater_than", [lhs, rhs], pa.bool_())
    pick_node = tree.make_if(is_greater, lhs, rhs, pa.int32())
    pick_expr = tree.make_expression(pick_node, output_field)

    assert pick_expr.result().type == pa.int32()

    projector = gandiva.make_projector(
        input_schema, [pick_expr], pa.default_memory_pool())

    # Gandiva generates compute kernel function named `@expr_X`
    assert projector.llvm_ir.find("@expr_") != -1

    batch = pa.RecordBatch.from_arrays(
        [
            pa.array([10, 12, -20, 5], type=pa.int32()),
            pa.array([5, 15, 15, 17], type=pa.int32()),
        ],
        names=['a', 'b'],
    )
    expected = pa.array([10, 15, 15, 17], type=pa.int32())

    (projected,) = projector.evaluate(batch)
    assert projected.equals(expected)
Example #14
0
        def gandiva_query2(table, query):
            """
            Compile a string query into a gandiva filter without evaluating it.

            Parameters
            ----------
            table : pyarrow.Table
                Table whose columns and schema the query refers to.
            query : str
                Query string to evaluate on the `table` columns.

            Returns
            -------
            pyarrow.gandiva.Filter

            Raises
            ------
            ValueError
                If `can_be_condition` rejects the expression generated from `query`.
            """
            parsed = gen_table_expr(table, query)
            if not can_be_condition(parsed):
                raise ValueError("Root operation should be a filter.")
            tree_builder = gandiva.TreeExprBuilder()
            condition = tree_builder.make_condition(
                build_node(table, parsed.terms, tree_builder)
            )
            return gandiva.make_filter(table.schema, condition)
Example #15
0
def test_literals():
    """Build literals of several Arrow types, by type object and by alias."""
    import pyarrow.gandiva as gandiva

    tree = gandiva.TreeExprBuilder()

    # (python value, arrow type object, string alias of the same type)
    samples = [
        (True, pa.bool_(), "bool"),
        (0, pa.uint8(), "uint8"),
        (1, pa.uint16(), "uint16"),
        (2, pa.uint32(), "uint32"),
        (3, pa.uint64(), "uint64"),
        (4, pa.int8(), "int8"),
        (5, pa.int16(), "int16"),
        (6, pa.int32(), "int32"),
        (7, pa.int64(), "int64"),
        (8.0, pa.float32(), "float32"),
        (9.0, pa.float64(), "float64"),
        ("hello", pa.string(), "string"),
        (b"world", pa.binary(), "binary"),
    ]

    for value, arrow_type, _alias in samples:
        tree.make_literal(value, arrow_type)
    for value, _arrow_type, alias in samples:
        tree.make_literal(value, alias)

    # A string value for an int64 literal and a missing type are both
    # expected to raise TypeError.
    with pytest.raises(TypeError):
        tree.make_literal("hello", pa.int64())
    with pytest.raises(TypeError):
        tree.make_literal(True, None)
Example #16
0
def test_table():
    """Project ``a + b`` over a two-column float table built from pandas."""
    import pyarrow.gandiva as gandiva

    df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
    table = pa.Table.from_pandas(df)

    builder = gandiva.TreeExprBuilder()
    # `Schema.field` replaces the deprecated `Schema.field_by_name`.
    node_a = builder.make_field(table.schema.field("a"))
    node_b = builder.make_field(table.schema.field("b"))

    # Named `sum_node` so the builtin `sum` is not shadowed.
    sum_node = builder.make_function("add", [node_a, node_b], pa.float64())

    field_result = pa.field("c", pa.float64())
    expr = builder.make_expression(sum_node, field_result)

    projector = gandiva.make_projector(table.schema, [expr],
                                       pa.default_memory_pool())

    # TODO: Add .evaluate function which can take Tables instead of
    # RecordBatches
    r, = projector.evaluate(table.to_batches()[0])

    e = pa.Array.from_pandas(df["a"] + df["b"])
    assert r.equals(e)
Example #17
0
def test_table():
    """Project ``a + b`` over a two-column float table."""
    import pyarrow.gandiva as gandiva

    columns = [pa.array([1.0, 2.0]), pa.array([3.0, 4.0])]
    table = pa.Table.from_arrays(columns, ['a', 'b'])

    tree = gandiva.TreeExprBuilder()
    lhs = tree.make_field(table.schema.field("a"))
    rhs = tree.make_field(table.schema.field("b"))

    add_node = tree.make_function("add", [lhs, rhs], pa.float64())
    output_field = pa.field("c", pa.float64())
    add_expr = tree.make_expression(add_node, output_field)

    projector = gandiva.make_projector(
        table.schema, [add_expr], pa.default_memory_pool())

    # TODO: Add .evaluate function which can take Tables instead of
    # RecordBatches
    (projected,) = projector.evaluate(table.to_batches()[0])

    expected = pa.array([4.0, 6.0])
    assert projected.equals(expected)
Example #18
0
def test_boolean():
    """Check a filter for ``(a < 50 and a > b) or b < 11`` on float columns."""
    import pyarrow.gandiva as gandiva

    col_a = pa.array([1., 31., 46., 3., 57., 44., 22.])
    col_b = pa.array([5., 45., 36., 73., 83., 23., 76.])
    table = pa.Table.from_arrays([col_a, col_b], ['a', 'b'])

    tree = gandiva.TreeExprBuilder()
    node_a = tree.make_field(table.schema.field("a"))
    node_b = tree.make_field(table.schema.field("b"))
    fifty = tree.make_literal(50.0, pa.float64())
    eleven = tree.make_literal(11.0, pa.float64())

    a_below_fifty = tree.make_function(
        "less_than", [node_a, fifty], pa.bool_())
    a_above_b = tree.make_function(
        "greater_than", [node_a, node_b], pa.bool_())
    b_below_eleven = tree.make_function(
        "less_than", [node_b, eleven], pa.bool_())

    both = tree.make_and([a_below_fifty, a_above_b])
    either = tree.make_or([both, b_below_eleven])
    condition = tree.make_condition(either)

    filter = gandiva.make_filter(table.schema, condition)
    selection = filter.evaluate(
        table.to_batches()[0], pa.default_memory_pool())
    expected = pa.array([0, 2, 5], type=pa.uint32())
    assert selection.to_array().equals(expected)
Example #19
0
def test_in_expr_todo():
    """Exercise IN expressions on binary, timestamp, time and date columns."""
    import pyarrow.gandiva as gandiva
    # TODO: Implement reasonable support for timestamp, time & date.
    # Current exceptions:
    # pyarrow.lib.ArrowException: ExpressionValidationError:
    # Evaluation expression for IN clause returns XXXX values are of typeXXXX

    # binary
    binary_values = pa.array([b"ga", b"an", b"nd", b"di", b"iv", b"va"])
    binary_table = pa.Table.from_arrays([binary_values], ["a"])

    tree = gandiva.TreeExprBuilder()
    column = tree.make_field(binary_table.schema.field("a"))
    membership = tree.make_in_expression(column, [b'an', b'nd'], pa.binary())
    condition = tree.make_condition(membership)

    filter = gandiva.make_filter(binary_table.schema, condition)
    selection = filter.evaluate(
        binary_table.to_batches()[0], pa.default_memory_pool())
    assert list(selection.to_array()) == [1, 2]

    # timestamp
    stamps = [
        datetime.datetime.utcfromtimestamp(seconds)
        for seconds in (
            1542238951.621877,
            1542238911.621877,
            1542238051.621877,
        )
    ]
    stamp_table = pa.Table.from_arrays([pa.array(stamps)], ["a"])

    tree = gandiva.TreeExprBuilder()
    column = tree.make_field(stamp_table.schema.field("a"))
    membership = tree.make_in_expression(
        column, [stamps[1]], pa.timestamp('ms'))
    condition = tree.make_condition(membership)

    filter = gandiva.make_filter(stamp_table.schema, condition)
    selection = filter.evaluate(
        stamp_table.to_batches()[0], pa.default_memory_pool())
    assert list(selection.to_array()) == [1]

    # time
    clock_times = [stamp.time() for stamp in stamps]
    time_table = pa.Table.from_arrays([pa.array(clock_times)], ["a"])

    tree = gandiva.TreeExprBuilder()
    column = tree.make_field(time_table.schema.field("a"))
    membership = tree.make_in_expression(
        column, [clock_times[1]], pa.time64('ms'))
    condition = tree.make_condition(membership)

    filter = gandiva.make_filter(time_table.schema, condition)
    selection = filter.evaluate(
        time_table.to_batches()[0], pa.default_memory_pool())
    assert list(selection.to_array()) == [1]

    # date
    calendar_dates = [stamp.date() for stamp in stamps]
    date_table = pa.Table.from_arrays([pa.array(calendar_dates)], ["a"])

    tree = gandiva.TreeExprBuilder()
    column = tree.make_field(date_table.schema.field("a"))
    membership = tree.make_in_expression(
        column, [calendar_dates[1]], pa.date32())
    condition = tree.make_condition(membership)

    filter = gandiva.make_filter(date_table.schema, condition)
    selection = filter.evaluate(
        date_table.to_batches()[0], pa.default_memory_pool())
    assert list(selection.to_array()) == [1]