def test_value_repr(self, value, type_):
    # type: (Optional[LiteralType], Optional[BQScalarType]) -> None
    '''Check Value's string representation.

    The repr embeds the reprs of the type and the value, in that order,
    e.g. Value(type_=BQScalarType.INTEGER, value=5).
    '''
    node = Value(value, type_)
    # Use the builtin repr() instead of calling __repr__ directly.
    representation = 'Value(type_={}, value={})'.format(repr(type_), repr(value))
    self.assertEqual(repr(node), representation)
def test_not_an_operator(self):
    # type: () -> None
    '''An unrecognized operator token makes reparsing fail.'''
    operands = [
        Value(3, BQScalarType.INTEGER),
        'bar',
        Value(4, BQScalarType.INTEGER),
    ]
    with self.assertRaisesRegexp(ValueError, 'Unknown operator string bar'):
        _reparse_binary_expression(operands)
def test_value_eval(self):
    # type: () -> None
    '''A literal constant broadcasts to one entry per row of the context table.'''
    context = EvaluationContext(self.small_table_context)
    context.add_table_from_node(
        TableReference(('my_project', 'my_dataset', 'my_table')), 'foo')
    constant = Value(12345, BQScalarType.INTEGER)
    typed_series = constant.evaluate(context)
    assert isinstance(typed_series, TypedSeries)
    self.assertEqual(list(typed_series.series), [12345, 12345])
def test_if_different_types(self):
    # type: () -> None
    '''IF whose THEN/ELSE branches have incoercible types raises ValueError.'''
    if_expression = If(
        Value(True, BQScalarType.BOOLEAN),   # condition
        Value('yes', BQScalarType.STRING),   # then
        Value(1, BQScalarType.INTEGER))      # else
    error = (r"Cannot implicitly coerce the given types: "
             r"\(BQScalarType.STRING, BQScalarType.INTEGER\)")
    with self.assertRaisesRegexp(ValueError, error):
        if_expression.evaluate(EMPTY_CONTEXT)
def test_if_error(self):
    # type: () -> None
    '''A non-boolean IF condition is rejected at evaluation time.'''
    condition = Value(5, BQScalarType.INTEGER)
    if_expression = If(condition,
                       Value(0, BQScalarType.INTEGER),
                       Value(1, BQScalarType.INTEGER))
    # The message embeds the evaluated condition, so regex-escape it.
    error = escape("IF condition isn't boolean! Found: {}".format(
        str(condition.evaluate(EMPTY_CONTEXT))))
    with self.assertRaisesRegexp(ValueError, error):
        if_expression.evaluate(EMPTY_CONTEXT)
def test_if_empty_context(self, condition_bool, result):
    # type: (bool, int) -> None
    '''IF <condition> THEN 0 ELSE 1 evaluates to the branch matching the condition.'''
    if_expression = If(
        Value(condition_bool, BQScalarType.BOOLEAN),
        Value(0, BQScalarType.INTEGER),
        Value(1, BQScalarType.INTEGER))
    typed_series = if_expression.evaluate(EMPTY_CONTEXT)
    assert isinstance(typed_series, TypedSeries)
    self.assertEqual(list(typed_series.series), [result])
def test_in_check(self, direction, result):
    # type: (str, List[bool]) -> None
    '''a IN (1, 3) / a NOT IN (1, 3), checked against each row of column a.'''
    in_check = InCheck(
        Field(('a',)),
        direction,
        (Value(1, type_=BQScalarType.INTEGER),
         Value(3, type_=BQScalarType.INTEGER)))
    context = EvaluationContext(self.small_table_context)
    context.add_table_from_node(
        TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
    typed_series = in_check.evaluate(context)
    assert isinstance(typed_series, TypedSeries)
    self.assertEqual(list(typed_series.series), result)
def test_if(self):
    # type: () -> None
    '''IF a > 1 THEN "yes" ELSE "no", evaluated per row of the table.'''
    condition = BinaryExpression(
        Field(('a',)), '>', Value(1, BQScalarType.INTEGER))
    if_expression = If(condition,
                       Value('yes', BQScalarType.STRING),
                       Value('no', BQScalarType.STRING))
    context = EvaluationContext(self.small_table_context)
    context.add_table_from_node(
        TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
    typed_series = if_expression.evaluate(context)
    assert isinstance(typed_series, TypedSeries)
    self.assertEqual(list(typed_series.series), ['no', 'yes'])
def test_query_expression_limit(self):
    # type: () -> None
    '''LIMIT 1 OFFSET 1 keeps only the second row of the table.'''
    table_ref = TableReference(('my_project', 'my_dataset', 'my_table'))
    from_ = DataSource((table_ref, EMPTY_NODE), [])
    selector = StarSelector(EMPTY_NODE, EMPTY_NODE, EMPTY_NODE)
    select = Select(EMPTY_NODE, [selector], from_,
                    EMPTY_NODE, EMPTY_NODE, EMPTY_NODE)
    limit_offset = (Value(1, BQScalarType.INTEGER),
                    Value(1, BQScalarType.INTEGER))
    qe = QueryExpression(EMPTY_NODE, select, EMPTY_NODE, limit_offset)
    dataframe, table_name = qe.get_dataframe(self.table_context)
    self.assertEqual(dataframe.to_list_of_lists(), [[2]])
def test_extract(self, part, result):
    # type: (str, int) -> None
    '''EXTRACT(<part> FROM TIMESTAMP '2019-05-09') yields the expected field.'''
    timestamp = Value(pd.Timestamp('2019-05-09'), BQScalarType.TIMESTAMP)
    typed_series = Extract(part, timestamp).evaluate(EMPTY_CONTEXT)
    assert isinstance(typed_series, TypedSeries)
    self.assertEqual(list(typed_series.series), [result])
def test_not(self):
    # type: () -> None
    '''NOT TRUE evaluates to FALSE.'''
    not_expression = Not(Value(True, BQScalarType.BOOLEAN))
    typed_series = not_expression.evaluate(EMPTY_CONTEXT)
    assert isinstance(typed_series, TypedSeries)
    self.assertEqual(list(typed_series.series), [False])
def test_unary_negation(self, initial_value, value_type, result_value):
    # type: (Any, BQScalarType, Any) -> None
    '''Unary minus negates numeric values.'''
    negation = UnaryNegation(Value(initial_value, value_type))
    typed_series = negation.evaluate(EMPTY_CONTEXT)
    assert isinstance(typed_series, TypedSeries)
    self.assertEqual(list(typed_series.series), [result_value])
def test_extract_unimplemented(self):
    # type: () -> None
    '''An unimplemented EXTRACT date part raises NotImplementedError.'''
    timestamp = Value(pd.Timestamp('2019-05-09'), BQScalarType.TIMESTAMP)
    extract = Extract('WEEK(TUESDAY)', timestamp)
    with self.assertRaisesRegexp(NotImplementedError,
                                 r'WEEK\(TUESDAY\) not implemented'):
        extract.evaluate(EMPTY_CONTEXT)
def test_unary_negation_error(self, value, value_type):
    # type: (Any, BQScalarType) -> None
    '''Unary minus rejects non-numeric operands with a TypeError.'''
    negation = UnaryNegation(Value(value, value_type))
    error = ("UnaryNegation expression supports only integers and floats, got: {}"
             .format(value_type))
    with self.assertRaisesRegexp(TypeError, error):
        negation.evaluate(EMPTY_CONTEXT)
def test_even_length_sequence_raises(self):
    # type: () -> None
    '''A binary-expression token sequence must alternate operand/operator (odd length).'''
    sequence = [Value(3, BQScalarType.INTEGER), '+']
    with self.assertRaisesRegexp(ValueError, 'Sequence must be of odd length'):
        _reparse_binary_expression(sequence)
class DataframeNodeTest(unittest.TestCase):
    '''Tests for dataframe-producing AST nodes (Select, QueryExpression, TableReference).'''

    def setUp(self):
        # type: () -> None
        # Single-column table my_table = [[1], [2], [3]], shared by most tests below.
        self.table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(pd.DataFrame([[1], [2], [3]], columns=['a']),
                                   types=[BQScalarType.INTEGER])
                }
            }
        })

    def test_marker_syntax_tree_node(self):
        # type: () -> None
        self.assertEqual(Select.literal(), 'SELECT')

    def test_query_expression(self):
        # type: () -> None
        from_ = DataSource((TableReference(
            ('my_project', 'my_dataset', 'my_table')), EMPTY_NODE), [])
        selector = StarSelector(EMPTY_NODE, EMPTY_NODE, EMPTY_NODE)
        select = Select(EMPTY_NODE, [selector], from_, EMPTY_NODE, EMPTY_NODE,
                        EMPTY_NODE)
        qe = QueryExpression(EMPTY_NODE, select, EMPTY_NODE, EMPTY_NODE)
        dataframe, table_name = qe.get_dataframe(self.table_context)
        self.assertEqual(table_name, None)
        self.assertEqual(dataframe.to_list_of_lists(), [[1], [2], [3]])
        self.assertEqual(list(dataframe.dataframe), ['a'])
        self.assertEqual(dataframe.types, [BQScalarType.INTEGER])

    def test_query_expression_limit(self):
        # type: () -> None
        # LIMIT 1 OFFSET 1 keeps only the second row.
        from_ = DataSource((TableReference(
            ('my_project', 'my_dataset', 'my_table')), EMPTY_NODE), [])
        selector = StarSelector(EMPTY_NODE, EMPTY_NODE, EMPTY_NODE)
        select = Select(EMPTY_NODE, [selector], from_, EMPTY_NODE, EMPTY_NODE,
                        EMPTY_NODE)
        limit = Value(1, BQScalarType.INTEGER)
        offset = Value(1, BQScalarType.INTEGER)
        qe = QueryExpression(EMPTY_NODE, select, EMPTY_NODE, (limit, offset))
        dataframe, table_name = qe.get_dataframe(self.table_context)
        self.assertEqual(dataframe.to_list_of_lists(), [[2]])

    @data(
        dict(query_expression=
             'select a as d from my_table union all select c from my_table',
             expected_result=[[1], [3]]),
        dict(
            query_expression='select * from my_table union all select 4, 5, 6',
            expected_result=[[1, 2, 3], [4, 5, 6]]),
        dict(query_expression='select 1 union all select 2.0',
             expected_result=[[1.0], [2.0]]),
    )
    @unpack
    def test_query_expression_set_operation(self, query_expression,
                                            expected_result):
        table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(
                        pd.DataFrame([[1, 2, 3]], columns=['a', 'b', 'c']), [
                            BQScalarType.INTEGER, BQScalarType.INTEGER,
                            BQScalarType.INTEGER
                        ])
                }
            }
        })
        query_expression_node, leftover = query_expression_rule(
            tokenize(query_expression))
        dataframe, unused_table_name = query_expression_node.get_dataframe(
            table_context)
        self.assertFalse(leftover)
        self.assertEqual(dataframe.to_list_of_lists(), expected_result)

    @data(
        dict(query_expression='select 1 union all select 2, 3',
             error='mismatched column count: 1 vs 2'),
        dict(query_expression='select 1 union all select "foo"',
             error='Cannot implicitly coerce the given types'),
    )
    @unpack
    def test_query_expression_set_operation_error(self, query_expression,
                                                  error):
        table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(
                        pd.DataFrame([[1, 2, 3]], columns=['a', 'b', 'c']), [
                            BQScalarType.INTEGER, BQScalarType.INTEGER,
                            BQScalarType.INTEGER
                        ])
                }
            }
        })
        query_expression_node, leftover = query_expression_rule(
            tokenize(query_expression))
        self.assertFalse(leftover)
        with self.assertRaisesRegexp(ValueError, error):
            query_expression_node.get_dataframe(table_context)

    @data(
        dict(query_expression=
             'WITH q1 as (SELECT a+1 as b FROM my_table) SELECT * from q1',
             expected_result=[[2], [3], [4]]),
        # This example is closely based on
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#with-clause
        dict(
            query_expression=(
                'WITH q1 AS (SELECT a+1 as b FROM my_table LIMIT 1)'  # q1 is [[2]]
                'SELECT *'
                'FROM'
                ' (WITH q2 AS (SELECT b+1 as c FROM q1),'  # q1 resolves to [[2]]
                ' q3 AS (SELECT b+2 as d FROM q1),'  # q1 resolves to [[2]]
                ' q1 AS (SELECT b+4 as e FROM q1),'  # q1 (in the query) resolves to [[2]]
                ' q4 AS (SELECT e+8 as f FROM q1)'  # q1 resolves to the WITH subquery
                # on the previous line.
                ' SELECT e, c, d, f FROM q1, q2, q3, q4)'
            ),  # q1 resolves to 3rd inner WITH subquery.
            expected_result=[[6, 3, 4, 14]],
        ),
    )
    @unpack
    def test_with_clause(self, query_expression, expected_result):
        # type: (str, List[List[int]]) -> None
        query_expression_node, leftover = query_expression_rule(
            tokenize(query_expression))
        self.assertFalse(leftover)
        assert isinstance(query_expression_node, QueryExpression)
        dataframe, _ = query_expression_node.get_dataframe(self.table_context)
        self.assertEqual(dataframe.to_list_of_lists(), expected_result)

    @data(
        dict(query_expression=('WITH q1 as (SELECT a+1 as b FROM my_table),'
                               'q1 as (SELECT a+2 as b FROM my_table)'
                               'SELECT * from q1'),
             error='Duplicate names in WITH clauses are not allowed'),
    )
    @unpack
    def test_with_clause_error(self, query_expression, error):
        # type: (str, str) -> None
        query_expression_node, leftover = query_expression_rule(
            tokenize(query_expression))
        self.assertFalse(leftover)
        assert isinstance(query_expression_node, QueryExpression)
        with self.assertRaisesRegexp(ValueError, error):
            query_expression_node.get_dataframe(self.table_context)

    # constants for use in data-driven syntax tests below
    with_clause = 'with foo as (select 1), bar as (select 2)'
    select_clause = 'select 3'
    order_by_clause = 'order by a, b'
    limit_clause = 'limit 5'

    @data(
        # core query expression grammar (no recursion)
        dict(query=(with_clause, select_clause, order_by_clause, limit_clause)),
        dict(query=(with_clause, select_clause, limit_clause)),
        dict(query=(with_clause, select_clause, order_by_clause)),
        dict(query=(with_clause, select_clause)),
        dict(query=(select_clause, order_by_clause, limit_clause)),
        dict(query=(select_clause, limit_clause)),
        dict(query=(select_clause, order_by_clause)),
        dict(query=(select_clause, )),
        # parenthesized recursion
        dict(query=(with_clause, '(', with_clause, select_clause,
                    order_by_clause, limit_clause, ')', order_by_clause,
                    limit_clause)),
        dict(query=(with_clause, '(', with_clause, select_clause,
                    order_by_clause, limit_clause, ')', limit_clause)),
        dict(query=(with_clause, '(', with_clause, select_clause,
                    order_by_clause, limit_clause, ')', order_by_clause)),
        dict(query=(with_clause, '(', with_clause, select_clause,
                    order_by_clause, limit_clause, ')')),
        # set operations
        dict(query=(select_clause, 'UNION ALL', select_clause)),  # simplest case
        dict(query=(
            # first query expression
            with_clause, select_clause, order_by_clause, limit_clause,
            'UNION ALL',
            # second query expression
            with_clause, select_clause, order_by_clause, limit_clause)),
    )
    @unpack
    def test_query_expression_syntax(self, query):
        # Only checks that the token sequence parses completely; no evaluation.
        query_expression = ' '.join(query)
        query_expression_node, leftover = query_expression_rule(
            tokenize(query_expression))
        self.assertFalse(leftover)

    @data(
        dict(query_expression='select * from my_table order by a DESC',
             expected_result=[[3], [2], [1]]),
        dict(query_expression='select a from my_table order by 1 DESC',
             expected_result=[[3], [2], [1]]),
    )
    @unpack
    def test_query_expression_order_by(self, query_expression,
                                       expected_result):
        # type: (str, List[List[int]]) -> None
        query_expression_node, leftover = query_expression_rule(
            tokenize(query_expression))
        assert isinstance(query_expression_node, QueryExpression)
        dataframe, unused_table_name = query_expression_node.get_dataframe(
            self.table_context)
        self.assertFalse(leftover)
        self.assertEqual(dataframe.to_list_of_lists(), expected_result)

    def test_select(self):
        # type: () -> None
        from_ = DataSource((TableReference(
            ('my_project', 'my_dataset', 'my_table')), EMPTY_NODE), [])
        selector = StarSelector(EMPTY_NODE, EMPTY_NODE, EMPTY_NODE)
        select = Select(EMPTY_NODE, [selector], from_, EMPTY_NODE, EMPTY_NODE,
                        EMPTY_NODE)
        dataframe, table_name = select.get_dataframe(self.table_context)
        self.assertEqual(table_name, None)
        self.assertEqual(dataframe.to_list_of_lists(), [[1], [2], [3]])
        self.assertEqual(list(dataframe.dataframe), ['a'])
        self.assertEqual(dataframe.types, [BQScalarType.INTEGER])

    @data(
        dict(select='select c from my_table group by c',
             expected_result=[[3]]),
        dict(select='select my_table.c from my_table group by c',
             expected_result=[[3]]),
        dict(select='select c from my_table group by my_table.c',
             expected_result=[[3]]),
        dict(select='select my_table.c from my_table group by my_table.c',
             expected_result=[[3]]),
        dict(select='select c from my_table group by 1',
             expected_result=[[3]]),
        dict(select='select max(b), c from my_table group by 2',
             expected_result=[[3, 3]]),
        dict(select='select c+1 from my_table group by c',
             expected_result=[[4]]),
        dict(select='select c+1 from my_table group by 1',
             expected_result=[[4]]),
        dict(select='select c+1 from my_table group by c, 1',
             expected_result=[[4]]),
        dict(select='select c+1 as d from my_table group by c,d',
             expected_result=[[4]]),
        dict(select='select c+1 as d from my_table group by c,1',
             expected_result=[[4]]),
        dict(select='select c+1 from my_table group by c,1',
             expected_result=[[4]]),
        # Naively evaluating the below after the group by has occurred fails with
        # Column(s) my_table.c already selected
        dict(select='select c+1 as d, count(*) from my_table group by d',
             expected_result=[[4, 2]]),
        # Naively evaluating the below after the group by has occurred tries and fails to add
        # two SeriesGroupbys
        dict(select='select a+c as d from my_table group by d',
             expected_result=[[4]]),
    )
    @unpack
    def test_select_group_by(self, select, expected_result):
        # type: (str, List[List[int]]) -> None
        group_table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(pd.DataFrame([[1, 2, 3], [1, 3, 3]],
                                                columns=['a', 'b', 'c']),
                                   types=[
                                       BQScalarType.INTEGER,
                                       BQScalarType.INTEGER,
                                       BQScalarType.INTEGER
                                   ])
                }
            }
        })
        select_node, leftover = select_rule(tokenize(select))
        assert isinstance(select_node, Select)
        dataframe, unused_table_name = select_node.get_dataframe(
            group_table_context)
        self.assertFalse(leftover)
        self.assertEqual(dataframe.to_list_of_lists(), expected_result)

    @data(
        dict(select='select b from my_table group by c'),
        dict(select='select b,c from my_table group by c'),
        # Note: a is actually constant within the group, but it doesn't matter because you can't
        # tell that statically from the query.
        dict(select='select a from my_table group by c'),
    )
    @unpack
    def test_select_group_by_error(self, select):
        # type: (str) -> None
        group_table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(pd.DataFrame([[1, 2, 3], [1, 3, 3]],
                                                columns=['a', 'b', 'c']),
                                   types=[
                                       BQScalarType.INTEGER,
                                       BQScalarType.INTEGER,
                                       BQScalarType.INTEGER
                                   ])
                }
            }
        })
        select_node, leftover = select_rule(tokenize(select))
        assert isinstance(select_node, Select)
        self.assertFalse(leftover)
        with self.assertRaisesRegexp(ValueError,
                                     "not aggregated or grouped by"):
            select_node.get_dataframe(group_table_context)

    @data(
        dict(select='select distinct a from my_table', expected_result=[[1]]),
        dict(select='select distinct b from my_table',
             expected_result=[[2], [3]]),
        dict(select='select distinct a, b from my_table',
             expected_result=[[1, 2], [1, 3]]),
    )
    @unpack
    def test_select_distinct(self, select, expected_result):
        # type: (str, List[List[int]]) -> None
        table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(
                        pd.DataFrame([[1, 2], [1, 3]], columns=['a', 'b']),
                        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
                }
            }
        })
        select_node, leftover = select_rule(tokenize(select))
        assert isinstance(select_node, Select)
        dataframe, unused_table_name = select_node.get_dataframe(table_context)
        self.assertFalse(leftover)
        self.assertEqual(dataframe.to_list_of_lists(), expected_result)

    @data(
        # WHERE b = 4
        (BinaryExpression(Field(('b', )), '=',
                          Value(value=4, type_=BQScalarType.INTEGER)), ),
        # WHERE b = 4 AND a = 3
        (BinaryExpression(
            BinaryExpression(Field(('b', )), '=',
                             Value(value=4, type_=BQScalarType.INTEGER)),
            'AND',
            BinaryExpression(Field(('a', )), '=',
                             Value(value=3, type_=BQScalarType.INTEGER))), ))
    @unpack
    def test_select_where(self, where):
        # type: (EvaluatableNode) -> None
        where_table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(
                        pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b']),
                        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
                }
            }
        })
        fields = [Selector(Field(('a', )), EMPTY_NODE)]
        from_ = DataSource((TableReference(
            ('my_project', 'my_dataset', 'my_table')), EMPTY_NODE), [])
        select = Select(EMPTY_NODE, fields, from_, where, EMPTY_NODE,
                        EMPTY_NODE)
        dataframe, table_name = select.get_dataframe(where_table_context)
        self.assertEqual(dataframe.to_list_of_lists(), [[3]])

    @data(
        dict(select='SELECT sum(a) as c FROM my_table GROUP BY b HAVING c > 4',
             expected_result=[[7]]),
        dict(select='SELECT b FROM my_table GROUP BY b HAVING sum(a) > 4',
             expected_result=[[3]]),
        dict(select='SELECT sum(a) as c FROM my_table GROUP BY b HAVING b > 4',
             expected_result=[[4]]),
        dict(
            select=
            'SELECT sum(a) as c FROM my_table GROUP BY b HAVING b=1 AND c<0 AND MIN(a)<0',
            expected_result=[[-98]]),
    )
    @unpack
    def test_select_having(self, select, expected_result):
        # type: (str, List[List[int]]) -> None
        group_table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(
                        pd.DataFrame(
                            [[1, 3], [6, 3], [4, 5], [-100, 1], [2, 1]],
                            columns=['a', 'b']),
                        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
                }
            }
        })
        select_node, leftover = select_rule(tokenize(select))
        assert isinstance(select_node, Select)
        dataframe, unused_table_name = select_node.get_dataframe(
            group_table_context)
        self.assertFalse(leftover)
        self.assertEqual(dataframe.to_list_of_lists(), expected_result)

    @data(
        dict(select='SELECT * EXCEPT (a) FROM table1',
             expected_result=[[8, 4], [3, 0], [10, 1]]),
        dict(select='SELECT * EXCEPT (a, c) FROM table1',
             expected_result=[[8], [3], [10]]),
        dict(select='SELECT * REPLACE (a/2 as a) FROM table1',
             expected_result=[[1, 8, 4], [3, 3, 0], [6, 10, 1]]),
        dict(select='SELECT table1.*, d FROM table1 JOIN table2 USING (a)',
             expected_result=[[2, 8, 4, 7], [6, 3, 0, 2], [12, 10, 1, 9]]),
    )
    @unpack
    def test_select_star(self, select, expected_result):
        # type: (str, List[List[int]]) -> None
        group_table_context = DatasetTableContext({
            'p': {
                'd': {
                    'table1':
                    TypedDataFrame(
                        pd.DataFrame([[2, 8, 4], [6, 3, 0], [12, 10, 1]],
                                     columns=['a', 'b', 'c']),
                        types=[
                            BQScalarType.INTEGER, BQScalarType.INTEGER,
                            BQScalarType.INTEGER
                        ]),
                    'table2':
                    TypedDataFrame(
                        pd.DataFrame([[2, 7, 3], [6, 2, -1], [12, 9, 0]],
                                     columns=['a', 'd', 'e']),
                        types=[
                            BQScalarType.INTEGER, BQScalarType.INTEGER,
                            BQScalarType.INTEGER
                        ]),
                }
            }
        })
        select_node, leftover = select_rule(tokenize(select))
        assert isinstance(select_node, Select)
        dataframe, unused_table_name = select_node.get_dataframe(
            group_table_context)
        self.assertFalse(leftover)
        self.assertEqual(dataframe.to_list_of_lists(), expected_result)

    @data((('my_project', 'my_dataset', 'my_table'), ),
          (('my_dataset', 'my_table'), ), (('my_table', ), ),
          (('my_project.my_dataset.my_table', ), ))
    @unpack
    def test_table_reference(self, reference):
        # type: (Tuple[str, ...]) -> None
        table_ref = TableReference(reference)
        dataframe, table_name = table_ref.get_dataframe(self.table_context)
        self.assertEqual(table_name, 'my_table')
        self.assertEqual(dataframe.to_list_of_lists(), [[1], [2], [3]])
        self.assertEqual(list(dataframe.dataframe), ['a'])
        self.assertEqual(dataframe.types, [BQScalarType.INTEGER])

    def test_table_reference_multi_project(self):
        # type: () -> None
        # A partially-qualified name is ambiguous across multiple projects.
        new_table_context = DatasetTableContext({
            'project1': {
                'dataset1': {
                    'table1': TypedDataFrame(pd.DataFrame(), [])
                }
            },
            'project2': {
                'dataset2': {
                    'table2': TypedDataFrame(pd.DataFrame(), [])
                }
            }
        })
        table_ref = TableReference(('dataset1', 'table1'))
        expected_error = "Non-fully-qualified table \\('dataset1', 'table1'\\) with multiple "\
            "possible projects \\['project1', 'project2'\\]"
        with self.assertRaisesRegexp(ValueError, expected_error):
            table_ref.get_dataframe(new_table_context)

    def test_table_reference_multi_dataset(self):
        # type: () -> None
        # A bare table name is ambiguous across multiple datasets.
        new_table_context = DatasetTableContext({
            'project1': {
                'dataset1': {
                    'table1': TypedDataFrame(pd.DataFrame(), [])
                },
                'dataset2': {
                    'table2': TypedDataFrame(pd.DataFrame(), [])
                }
            },
        })
        table_ref = TableReference(('table1', ))
        expected_error = "Non-fully-qualified table \\('table1',\\) with multiple possible "\
            "datasets \\['dataset1', 'dataset2'\\]"
        with self.assertRaisesRegexp(ValueError, expected_error):
            table_ref.get_dataframe(new_table_context)

    @data(
        dict(
            query='SELECT * from UNNEST([])',
            result=[],
            # UNNEST always returns one column unless the result is a struct.
            result_columns=['f0_']),
        dict(query='SELECT * from UNNEST([1, 2, 3])',
             result=[[1], [2], [3]],
             result_columns=['f0_']),
        dict(query='SELECT * from UNNEST(["a", "b", "c"])',
             result=[['a'], ['b'], ['c']],
             result_columns=['f0_']),
        dict(query='SELECT * from UNNEST([(1, "a"), (2, "b")])',
             result=[[1, 'a'], [2, 'b']],
             result_columns=['f0_', 'f1_']),
        dict(
            query=
            'SELECT x, y from UNNEST([STRUCT<x INTEGER, y STRING>(1, "a"), (2, "b")])',
            result=[[1, 'a'], [2, 'b']],
            result_columns=['x', 'y']),
    )
    @unpack
    def test_unnest(self, query, result, result_columns):
        # type: (str, List[List[Any]], List[str]) -> None
        node, leftover = query_expression_rule(tokenize(query))
        self.assertFalse(leftover)
        assert isinstance(node, QueryExpression)
        dataframe, _ = node.get_dataframe(TableContext())
        self.assertEqual(dataframe.to_list_of_lists(), result)
        self.assertEqual(list(dataframe.dataframe.columns), result_columns)

    @data(
        dict(query='SELECT * from UNNEST([1, "a"])',
             expected_error='Cannot implicitly coerce the given types'),
    )
    @unpack
    def test_unnest_error(self, query, expected_error):
        # type: (str, str) -> None
        node, leftover = query_expression_rule(tokenize(query))
        self.assertFalse(leftover)
        assert isinstance(node, QueryExpression)
        with self.assertRaisesRegexp(ValueError, expected_error):
            node.get_dataframe(TableContext())
def test_literals_false(self):
    # type: () -> None
    '''FALSE (even lowercase) parses to a boolean Value.'''
    expected = (Value(False, BQScalarType.BOOLEAN), [])
    self.assertEqual(literal(['false']), expected)
def test_literals_true(self):
    # type: () -> None
    '''TRUE parses to a boolean Value; leftover tokens are passed back.'''
    expected = (Value(True, BQScalarType.BOOLEAN), ['abc'])
    self.assertEqual(literal(['TRUE', 'abc']), expected)
def test_literals_null(self):
    # type: () -> None
    '''NULL parses to the typeless null Value.'''
    expected = (Value(None, None), [])
    self.assertEqual(literal(['NULL']), expected)
def test_literals_string(self):
    # type: () -> None
    '''A double-quoted token parses to a STRING Value.'''
    expected = (Value('something', BQScalarType.STRING), [])
    self.assertEqual(literal(['"something"']), expected)
def test_literals_integer(self):
    # type: () -> None
    '''A digit-only token parses to an INTEGER Value.'''
    expected = (Value(11, BQScalarType.INTEGER), [])
    self.assertEqual(literal(['11']), expected)
def test_literals_float(self):
    # type: () -> None
    '''A token with a decimal point parses to a FLOAT Value.'''
    expected = (Value(1.23, BQScalarType.FLOAT), [])
    self.assertEqual(literal(['1.23']), expected)
def test_not_type_error(self):
    # type: () -> None
    '''NOT applied to a non-boolean operand raises ValueError.'''
    not_expression = Not(Value(5, BQScalarType.INTEGER))
    with self.assertRaisesRegexp(ValueError, ""):
        not_expression.evaluate(EMPTY_CONTEXT)
class QueryHelperTest(unittest.TestCase):
    '''Tests for the grammar-combinator helpers (apply_rule, separated_sequence).'''

    @data(dict(rule='SELECT',
               tokens=['SELECT', '*', 'FROM', 'TABLE'],
               result=('SELECT', ['*', 'FROM', 'TABLE']),
               comment='Rule defined by string'),
          dict(rule=('ORDER', 'BY'),
               tokens=['ORDER', 'BY', 'SomeField'],
               result=((), ['SomeField']),
               comment='Rule defined by tuple'),
          dict(rule=['FROM', 'WHERE'],
               tokens=['FROM', 'SomeTable'],
               result=('FROM', ['SomeTable']),
               comment='Rule defined by list'),
          dict(rule=['FROM', 'WHERE', None],
               tokens=['SomethingElse'],
               result=(EMPTY_NODE, ['SomethingElse']),
               comment='Rule defined by optional list'),
          dict(rule=None,
               tokens=['a', 'b', 'c'],
               result=(EMPTY_NODE, ['a', 'b', 'c']),
               comment='Rule is None'),
          dict(rule=literal,
               tokens=['1.23'],
               result=(Value(1.23, BQScalarType.FLOAT), []),
               comment='Rule defined by a method'),
          dict(rule=(TestNode, identifier, identifier),
               tokens=['a', 'b'],
               result=(TestNode('a', 'b'), []),
               comment='Rule defined by an abstract syntax tree node'),
          dict(rule='SELECT',
               tokens=['WHERE'],
               result=(None, ['WHERE']),
               comment='Rule defined by string does not match'),
          dict(rule=('ORDER', 'BY'),
               tokens=['ORDER', 'WITH', 'SomeField'],
               result=(None, ['ORDER', 'WITH', 'SomeField']),
               comment='Rule defined by tuple does not match'),
          dict(rule=['FROM', 'WHERE'],
               tokens=['SomeTable'],
               result=(None, ['SomeTable']),
               comment='Rule defined by list does not match'))
    @unpack
    def test_apply_rule(self,
                        rule,  # type: RuleType
                        tokens,  # type: List[str]
                        result,  # type: AppliedRuleOutputType
                        comment  # type: str
                        ):
        # type: (...) -> None
        self.assertEqual(apply_rule(rule, tokens), result)

    @data(dict(rule=identifier,
               separator=['ASC', 'DESC'],
               tokens=['a', 'ASC', 'b', 'DESC'],
               result=(('a', 'b'), []),
               comment='Rule is a method, separator is either ASC or DESC'),
          dict(rule=literal,
               separator=',',
               tokens=['1', ',', '2', ',', '3'],
               result=((Value(1, BQScalarType.INTEGER),
                        Value(2, BQScalarType.INTEGER),
                        Value(3, BQScalarType.INTEGER)), []),
               comment='Rule is a method, separator is a comma'),
          dict(rule=(identifier, '=', identifier),
               separator='AND',
               tokens=['field1', '=', 'field2', 'AND', 'field3', '=', 'field4'],
               result=((('field1', 'field2'), ('field3', 'field4')), []),
               comment='Rule is a more complex tuple, separator is AND'))
    @unpack
    def test_separated_sequence(self,
                                rule,  # type: RuleType
                                separator,  # type: RuleType
                                tokens,  # type: List[str]
                                result,  # type: AppliedRuleOutputType
                                comment  # type: str
                                ):
        # type: (...) -> None
        self.assertEqual(separated_sequence(rule, separator)(tokens), result)

    def test_separated_sequence_keep_separator(self):
        '''Test keep_separator parameter'''
        sequence_check = separated_sequence(identifier, ',',
                                            keep_separator=True)
        self.assertEqual(sequence_check(['a', ',', 'b']),
                         (('a', ',', 'b'), []))

    def test_separated_sequence_wrapper(self):
        '''Test wrapper parameter'''
        sequence_check = separated_sequence(identifier, ',', wrapper=list)
        self.assertEqual(sequence_check(['a', ',', 'b']), (['a', 'b'], []))
typed_series = case.evaluate(context) assert isinstance(typed_series, TypedSeries) self.assertEqual(list(typed_series.series), result) def test_case_no_whens(self): comparand = EMPTY_NODE whens = [] else_ = EMPTY_NODE error = "Must provide at least one WHEN for a CASE" with self.assertRaisesRegexp(ValueError, error): Case(comparand, whens, else_) @data( dict(comparand=EMPTY_NODE, whens=[(Value(1, BQScalarType.INTEGER), Value("one", BQScalarType.STRING))], else_=EMPTY_NODE, error="CASE condition isn't boolean! Found: {!r}".format( TypedSeries(pd.Series([1, 1]), BQScalarType.INTEGER))), dict(comparand=Field(('a', )), whens=[(Value(1, BQScalarType.INTEGER), Value("one", BQScalarType.STRING))], else_=Value(100, BQScalarType.INTEGER), error="Cannot implicitly coerce the given types: " "(BQScalarType.STRING, BQScalarType.INTEGER)"), ) @unpack def test_case_error( self, comparand, # type: Union[_EmptyNode, EvaluatableNode]
class EvaluatableNodeTest(unittest.TestCase):
    '''Tests evaluation of expression AST nodes against small in-memory tables.'''

    def setUp(self):
        # type: () -> None
        # Two shared fixtures: a one-column, two-row table and a
        # three-column, two-row table (column 'a' repeats so GROUP BY
        # tests have something to collapse).
        self.small_table_context = DatasetTableContext(
            {'my_project': {'my_dataset': {'my_table': TypedDataFrame(
                pd.DataFrame([[1], [2]], columns=['a']),
                types=[BQScalarType.INTEGER])}}})
        self.large_table_context = DatasetTableContext(
            {'my_project': {'my_dataset': {'my_table': TypedDataFrame(
                pd.DataFrame([[1, 2, 3], [1, 4, 3]], columns=['a', 'b', 'c']),
                types=[BQScalarType.INTEGER, BQScalarType.INTEGER, BQScalarType.INTEGER])}}})

    def test_selector(self):
        # type: () -> None
        '''A Selector evaluates its expression and renames the column to its alias.'''
        selector = Selector(Field(('a', )), 'field_alias')
        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
        typed_series = selector.evaluate(context)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), [1, 2])
        self.assertEqual(list(typed_series.dataframe), ['field_alias'])
        self.assertEqual(typed_series.types, [BQScalarType.INTEGER])

    def test_selector_group_by_success(self):
        # type: () -> None
        '''A selector of a grouped-by column evaluates to the distinct group values.'''
        selector = Selector(Field(('c', )), EMPTY_NODE)
        selector.position = 1
        context = EvaluationContext(self.large_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
        context.exclude_aggregation = True
        updated_selector, = context.do_group_by([selector], [Field(('my_table', 'c'))])
        typed_series = updated_selector.evaluate(context)
        assert isinstance(typed_series, TypedSeries)
        # Both rows have c == 3, so grouping by c yields a single value.
        self.assertEqual(list(typed_series.series), [3])

    @data((5, BQScalarType.INTEGER),
          (1.23, BQScalarType.FLOAT),
          ("something", BQScalarType.STRING),
          (True, BQScalarType.BOOLEAN),
          (None, None))
    @unpack
    def test_value_repr(self, value, type_):
        # type: (Optional[LiteralType], Optional[BQScalarType]) -> None
        '''Check Value's string representation'''
        node = Value(value, type_)
        representation = 'Value(type_={}, value={})'.format(type_.__repr__(),
                                                            value.__repr__())
        self.assertEqual(node.__repr__(), representation)

    @data((5, None),
          (None, BQScalarType.INTEGER))
    @unpack
    def test_invalid_value(self, value, type_):
        # type: (Optional[LiteralType], Optional[BQScalarType]) -> None
        '''Check that None is only allowed as both value and type_ or neither.'''
        with self.assertRaises(ValueError):
            Value(value, type_)

    def test_value_eval(self):
        # type: () -> None
        # A constant is repeated for each row in the context table.
        value = Value(12345, BQScalarType.INTEGER)
        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')), 'foo')
        typed_series = value.evaluate(context)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), [12345, 12345])

    def test_field(self):
        # type: () -> None
        '''A Field evaluates to the named column of the context table.'''
        field = Field(('a', ))
        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
        typed_series = field.evaluate(context)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), [1, 2])
        self.assertEqual(typed_series.series.name, 'a')

    @data(
        dict(function_name='sum', args=[Field(('a', ))], expected_result=[3],
             is_aggregating=True),
        dict(function_name='max', args=[Field(('a', ))], expected_result=[2],
             is_aggregating=True),
        dict(function_name='min', args=[Field(('a', ))], expected_result=[1],
             is_aggregating=True),
        dict(function_name='concat',
             args=[Value('foo', BQScalarType.STRING),
                   Value('bar', BQScalarType.STRING)],
             expected_result=['foobar'] * 2),  # two copies to match length of context table.
        dict(function_name='mod',
             args=[Field(('a', )), Value(2, BQScalarType.INTEGER)],
             expected_result=[1, 0]),
        dict(function_name='mod',
             args=[Value(1.0, BQScalarType.FLOAT), Value(2, BQScalarType.INTEGER)],
             expected_result=[1.0, 1.0]),
        dict(function_name='timestamp',
             args=[Value("2019-04-22", BQScalarType.STRING)],
             expected_result=[datetime.datetime(2019, 4, 22)] * 2),  # two copies to match table len
    )
    @unpack
    def test_functions(self, function_name, args, expected_result, is_aggregating=False):
        # type: (str, List[EvaluatableNode], List[PythonType], bool) -> None
        '''Each built-in function evaluates to the expected per-row (or aggregated) result.'''
        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
        if is_aggregating:
            context.do_group_by((), [])
        result = FunctionCall.create(function_name, args, EMPTY_NODE).evaluate(context)
        assert isinstance(result, TypedSeries)
        self.assertEqual([result.type_.convert(elt) for elt in result.series],
                         expected_result)

    def test_current_timestamp(self):
        # type: () -> None
        '''CURRENT_TIMESTAMP() yields one recent, identical timestamp for every row.'''
        node, leftover = apply_rule(query_expression, tokenize(
            'select current_timestamp(), a from unnest([struct(1 as a), struct(2), struct(3)])'))
        assert isinstance(node, QueryExpression)
        self.assertFalse(leftover)
        result, _ = node.get_dataframe(DatasetTableContext({}))
        table = cast(List[List[datetime.datetime]], result.to_list_of_lists())
        self.assertEqual(len(table), 3)
        # CURRENT_TIMESTAMP() returns a very recent timestamp
        self.assertLess((datetime.datetime.now() - table[0][0]).seconds, 2)
        # All rows have the same timestamp value.
        self.assertEqual(table[0][0], table[1][0])
        self.assertEqual(table[0][0], table[2][0])

    @data(
        # These expressions are ones whose EvaluatableNode subclass constructs a
        # new pandas Series rather than computing on existing ones.  See below:
        # this runs the risk of constructing it with an incorrect index.
        dict(query='select 10, c',
             expected_result=[[10, 6], [10, 9]]),
        dict(query='select [a, b], c',
             expected_result=[[(4, 5), 6], [(7, 8), 9]]),
        dict(query='select (a, b), c',
             expected_result=[[(4, 5), 6], [(7, 8), 9]]),
        dict(query='select exists(select 1), c',
             expected_result=[[True, 6], [True, 9]]),
        dict(query='select a in (1, 4), c',
             expected_result=[[True, 6], [False, 9]]),
        dict(query='select row_number() over (), c',
             expected_result=[[1, 6], [2, 9]]),
        dict(query='select current_timestamp() > timestamp("2019-01-01"), c',
             expected_result=[[True, 6], [True, 9]]),
    )
    @unpack
    def test_constructed_column_has_correct_index(self, query, expected_result):
        # type: (str, List[List[int]]) -> None
        '''Checks that manually constructed columns have the same index as the data.

        A manually constructed column will usually have an index 0, 1, 2, ...
        (e.g. pd.Series(['a', 'b', 'c']) has index 0, 1, 2).  The data may not;
        filtering, sorting or other changes might result in an index of
        different numbers.  If one column's index doesn't match the index of
        other columns, it can't be compared or joined with them properly.
        '''
        table_context = DatasetTableContext(
            {'my_project': {'my_dataset': {'my_table': TypedDataFrame(
                pd.DataFrame([[1, 2, -1], [4, 5, 6], [7, 8, 9]], columns=['a', 'b', 'c']),
                types=[BQScalarType.INTEGER, BQScalarType.INTEGER, BQScalarType.INTEGER])}}})

        # Skip the first row of the table, so that the index of the table that
        # the test queries operate on is [1, 2]; this makes sure that the index is
        # different from the default index you would get for a two-row column,
        # which would be [0, 1], to test that expressions are not incorrectly
        # using that default index.
        node, leftover = select_rule(
            tokenize(query + ' from (select * from my_table where c > 0)'))
        assert isinstance(node, Select)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertFalse(leftover)
        self.assertEqual(result.to_list_of_lists(), expected_result)
        self.assertEqual(list(result.dataframe.index), [1, 2])

    def test_bad_function(self):
        # type: () -> None
        '''An unknown function name raises NotImplementedError at evaluation time.'''
        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
        with self.assertRaisesRegexp(NotImplementedError, 'NOT_A_FUNCTION not implemented'):
            FunctionCall.create('not_a_function', [], EMPTY_NODE).evaluate(context)

    @data(
        # Explore each aggregate function, along with a non-aggregate function to make sure we
        # can compute both at once.
        dict(selectors='sum(a), b+10', expected_result=[[6, 11], [5, 12]]),
        dict(selectors='sum(a), 20+10', expected_result=[[6, 30], [5, 30]]),
        dict(selectors='sum(a+1), b+10', expected_result=[[8, 11], [6, 12]]),
        dict(selectors='max(a), b+10', expected_result=[[4, 11], [5, 12]]),
        dict(selectors='min(a), b+10', expected_result=[[2, 11], [5, 12]]),
        dict(selectors='count(a), b+10', expected_result=[[2, 11], [1, 12]]),
        dict(selectors='count(*), b+10', expected_result=[[2, 11], [2, 12]]),
        dict(selectors='array_agg(a), []', expected_result=[[(2, 4), ()], [(5, None), ()]]),
        dict(selectors='array_agg(a), [b]', expected_result=[[(2, 4), (1, )], [(5, None), (2, )]]),
        dict(selectors='array_agg(a), [7, 8]',
             expected_result=[[(2, 4), (7, 8)], [(5, None), (7, 8)]]),
        dict(selectors='array_agg(a), b+10', expected_result=[[(2, 4), 11], [(5, None), 12]]),
    )
    @unpack
    def test_aggregate_functions_in_group_by(self, selectors, expected_result):
        # type: (str, List[List[int]]) -> None
        '''Aggregates (and mixed aggregate/scalar selectors) evaluate per GROUP BY group.'''
        # NaN in column 'a' exercises NULL handling inside the aggregates.
        table_context = DatasetTableContext(
            {'my_project': {'my_dataset': {'my_table': TypedDataFrame(
                pd.DataFrame([[2, 1], [4, 1], [5, 2], [np.nan, 2]], columns=['a', 'b']),
                types=[BQScalarType.INTEGER, BQScalarType.INTEGER])}}})

        tokens = tokenize('select {} from my_table group by b'.format(selectors))
        node, leftover = select_rule(tokens)
        assert isinstance(node, Select)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertFalse(leftover)
        self.assertEqual(result.to_list_of_lists(), expected_result)

    @data(
        dict(query='select sum(a + 1) + 2, count(*) + 3, 4 from my_table',
             expected_result=[[11, 6, 4]]),
    )
    @unpack
    def test_aggregate_functions_in_expressions(self, query, expected_result):
        # type: (str, List[List[int]]) -> None
        '''Aggregate calls may be nested inside larger arithmetic expressions.'''
        table_context = DatasetTableContext(
            {'my_project': {'my_dataset': {'my_table': TypedDataFrame(
                pd.DataFrame([[1], [2], [3]], columns=['a']),
                types=[BQScalarType.INTEGER])}}})

        node, leftover = select_rule(tokenize(query))
        assert isinstance(node, Select)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertFalse(leftover)
        self.assertEqual(result.to_list_of_lists(), expected_result)

    @data(
        # Test all variations of creating a struct (typed, typeless, tuple),
        # with and without named fields, with one field, and then with two
        # fields.
        dict(query='SELECT STRUCT<INTEGER>(1)',
             expected_result=(1, ),
             expected_type=BQStructType([None], [BQScalarType.INTEGER])),
        dict(query='SELECT STRUCT<a INTEGER>(1)',
             expected_result=(1, ),
             expected_type=BQStructType(['a'], [BQScalarType.INTEGER])),
        dict(query='SELECT STRUCT(1 AS a)',
             expected_result=(1, ),
             expected_type=BQStructType(['a'], [BQScalarType.INTEGER])),
        dict(query='SELECT STRUCT(1)',
             expected_result=(1, ),
             expected_type=BQStructType([None], [BQScalarType.INTEGER])),
        # Note: no test of single-element tuple syntax, as that would just be a
        # parenthesized expression, there's no analogue to Python's trailing comma.
        dict(query='SELECT STRUCT<INTEGER, STRING>(1, "a")',
             expected_result=(1, 'a'),
             expected_type=BQStructType([None, None],
                                        [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT STRUCT<a INTEGER, STRING>(1, "a")',
             expected_result=(1, 'a'),
             expected_type=BQStructType(['a', None],
                                        [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT STRUCT<INTEGER, b STRING>(1, "a")',
             expected_result=(1, 'a'),
             expected_type=BQStructType([None, 'b'],
                                        [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT STRUCT<a INTEGER, b STRING>(1, "a")',
             expected_result=(1, 'a'),
             expected_type=BQStructType(['a', 'b'],
                                        [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT STRUCT(1 AS a, "a" as b)',
             expected_result=(1, 'a'),
             expected_type=BQStructType(['a', 'b'],
                                        [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT STRUCT(1, "a" as b)',
             expected_result=(1, 'a'),
             expected_type=BQStructType([None, 'b'],
                                        [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT STRUCT(1 AS a, "a")',
             expected_result=(1, 'a'),
             expected_type=BQStructType(['a', None],
                                        [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT STRUCT(1, "a")',
             expected_result=(1, 'a'),
             expected_type=BQStructType([None, None],
                                        [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT (1, "a")',
             expected_result=(1, 'a'),
             expected_type=BQStructType([None, None],
                                        [BQScalarType.INTEGER, BQScalarType.STRING])),
    )
    @unpack
    def test_struct_constant_expressions(self, query, expected_result, expected_type):
        # type: (str, Tuple[Optional[int], ...], BQStructType) -> None
        '''Struct literals evaluate to the expected tuple value and struct type.'''
        table_context = DatasetTableContext({})
        node, leftover = select_rule(tokenize(query))
        self.assertFalse(leftover)
        assert isinstance(node, Select)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertEqual(result.to_list_of_lists(), [[expected_result]])
        self.assertEqual(result.types, [expected_type])

    @data(
        # Test all three struct syntaxes, selecting a column as one field, a
        # constant as the other.
        dict(query='SELECT (a, "a") FROM my_table',
             expected_result=[[(1, 'a')], [(2, 'a')]],
             expected_types=[
                 BQStructType([None, None], [BQScalarType.INTEGER, BQScalarType.STRING])]),
        dict(query='SELECT STRUCT(a as x, "a" as y) FROM my_table',
             expected_result=[[(1, 'a')], [(2, 'a')]],
             expected_types=[
                 BQStructType(['x', 'y'], [BQScalarType.INTEGER, BQScalarType.STRING])]),
        dict(query='SELECT STRUCT<x INTEGER, y STRING>(a, "a") FROM my_table',
             expected_result=[[(1, 'a')], [(2, 'a')]],
             expected_types=[
                 BQStructType(['x', 'y'], [BQScalarType.INTEGER, BQScalarType.STRING])]),
    )
    @unpack
    def test_struct_field_and_constant(self, query, expected_result, expected_types):
        # type: (str, List[List[Tuple[Optional[int], ...]]], Sequence[BQStructType]) -> None
        '''Structs may mix a table column with a constant for their fields.'''
        node, leftover = select_rule(tokenize(query))
        self.assertFalse(leftover)
        assert isinstance(node, Select)
        result, unused_table_name = node.get_dataframe(self.small_table_context)
        self.assertEqual(result.to_list_of_lists(), expected_result)
        self.assertEqual(result.types, expected_types)

    @data(
        # Test combination types of arrays and structs.
        dict(query='SELECT ([1], "a")',
             expected_result=((1, ), 'a'),
             expected_type=BQStructType([None, None],
                                        [BQArray(BQScalarType.INTEGER), BQScalarType.STRING])),
        dict(query='SELECT STRUCT<x ARRAY<INTEGER>, y STRING>(ARRAY<INTEGER>[1], "a")',
             expected_result=((1, ), 'a'),
             expected_type=BQStructType(['x', 'y'],
                                        [BQArray(BQScalarType.INTEGER), BQScalarType.STRING])),
        dict(query='SELECT [(1, "a")]',
             expected_result=((1, 'a'), ),
             expected_type=BQArray(BQStructType([None, None],
                                                [BQScalarType.INTEGER, BQScalarType.STRING]))),
        dict(query='SELECT [STRUCT<a INTEGER, b STRING>(1, "a"), (2, "b")]',
             expected_result=((1, 'a'), (2, 'b')),
             expected_type=BQArray(BQStructType(['a', 'b'],
                                                [BQScalarType.INTEGER, BQScalarType.STRING]))),
        # Test that an array of structs merges and coerces the types of the
        # structs.
        dict(query='SELECT [STRUCT<a FLOAT, STRING>(1.0, "a"), STRUCT<INTEGER, b STRING>(2, "b")]',
             expected_result=((1.0, 'a'), (2.0, 'b')),
             expected_type=BQArray(BQStructType(['a', 'b'],
                                                [BQScalarType.FLOAT, BQScalarType.STRING]))),
        dict(query='SELECT [STRUCT<a INTEGER, b ARRAY<STRING> >(1, ["a"]), (2, ["b", "c"])]',
             expected_result=((1, ('a', )), (2, ('b', 'c'))),
             expected_type=BQArray(BQStructType(['a', 'b'],
                                                [BQScalarType.INTEGER,
                                                 BQArray(BQScalarType.STRING)]))),
    )
    @unpack
    def test_complex_types(self, query, expected_result, expected_type):
        # type: (str, Tuple[Optional[int], ...], BQType) -> None
        '''Nested combinations of arrays and structs evaluate with merged types.'''
        table_context = DatasetTableContext({})
        node, leftover = select_rule(tokenize(query))
        self.assertFalse(leftover)
        assert isinstance(node, Select)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertEqual(result.to_list_of_lists(), [[expected_result]])
        self.assertEqual(result.types, [expected_type])

    @data(
        dict(query='SELECT ARRAY_AGG(a)',
             expected_result=(1, 1, 2, None)),
        dict(query='SELECT ARRAY_AGG(a RESPECT NULLS)',
             expected_result=(1, 1, 2, None)),
        dict(query='SELECT ARRAY_AGG(DISTINCT a)',
             expected_result=(1, 2, None)),
        dict(query='SELECT ARRAY_AGG(DISTINCT a RESPECT NULLS)',
             expected_result=(1, 2, None)),
        dict(query='SELECT ARRAY_AGG(a IGNORE NULLS)',
             expected_result=(1, 1, 2)),
        dict(query='SELECT ARRAY_AGG(DISTINCT a IGNORE NULLS)',
             expected_result=(1, 2)),
    )
    @unpack
    def test_array_agg_arguments(self, query, expected_result):
        # type: (str, Tuple[Optional[int], ...]) -> None
        '''ARRAY_AGG honors its DISTINCT and RESPECT/IGNORE NULLS modifiers.'''
        table_context = DatasetTableContext(
            {'p': {'d': {'t': TypedDataFrame(
                pd.DataFrame([[1], [1], [2], [None]], columns=['a']),
                types=[BQScalarType.INTEGER])}}})

        node, leftover = select_rule(tokenize(query + ' FROM p.d.t'))
        self.assertFalse(leftover)
        assert isinstance(node, Select)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertEqual(result.to_list_of_lists(), [[expected_result]])

    @data(
        dict(query='SELECT [1,2,"a"]',
             error='Cannot implicitly coerce the given types'),
        dict(query='SELECT STRUCT<INT64>(3.7)',
             error='Struct field 1 has type .*FLOAT which does not coerce to .*INTEGER'),
        dict(query='SELECT ARRAY<INT64>[3.7]',
             error='Array specifies type .*INTEGER, incompatible with values of type .*FLOAT'),
        dict(query='SELECT ARRAY<INT64>[1,2,"a"]',
             error='Cannot implicitly coerce the given types'),
        dict(query='SELECT ARRAY<string>[1,2]',
             error='Cannot implicitly coerce the given types'),
        dict(query='SELECT [[1]]',
             error='Cannot create arrays of arrays'),
        dict(query='SELECT [(1, 2), (3, 4, 5)]',
             error='Cannot merge .* number of fields varies'),
        dict(query='SELECT [STRUCT(1 as a, 2 as b), STRUCT(3 as x, 4 as b)]',
             error='Cannot merge Structs; field names .* do not match'),
        # same types in different orders can't merge.
        dict(query='SELECT [(1, "a"), ("b", 2)]',
             error='Cannot implicitly coerce the given types'),
        # same names in different orders can't merge
        dict(query='SELECT [STRUCT(1 as a, 2 as b), STRUCT(3 as b, 4 as a)]',
             error='Cannot merge Structs; field names .* do not match'),
    )
    @unpack
    def test_complex_type_errors(self, query, error):
        # type: (str, str) -> None
        '''Invalid array/struct combinations raise ValueError with a descriptive message.'''
        node, leftover = select_rule(tokenize(query))
        self.assertFalse(leftover)
        assert isinstance(node, Select)
        with self.assertRaisesRegexp(ValueError, error):
            node.get_dataframe(self.small_table_context)

    @data(
        # Row number over whole dataset; order is not guaranteed
        dict(selectors='row_number() over ()',
             expected_result=[[1], [2], [3], [4]]),
        dict(selectors='row_number() over (order by a), a',
             expected_result=[[1, 10], [2, 20], [3, 30], [4, 30]]),
        dict(selectors='row_number() over (order by a asc), a',
             expected_result=[[1, 10], [2, 20], [3, 30], [4, 30]]),
        dict(selectors='row_number() over (order by a desc), a',
             expected_result=[[4, 10], [3, 20], [2, 30], [1, 30]]),
        dict(selectors='row_number() over (partition by b order by a), a',
             expected_result=[[1, 10], [2, 20], [1, 30], [2, 30]]),
        dict(selectors='sum(a) over (), a',
             expected_result=[[90, 10], [90, 20], [90, 30], [90, 30]]),
        dict(selectors='sum(a) over (partition by b), a',
             expected_result=[[30, 10], [30, 20], [60, 30], [60, 30]]),
        dict(selectors='count(*) over (), a',
             expected_result=[[4, 10], [4, 20], [4, 30], [4, 30]]),
        dict(selectors='count(a) over (), a',
             expected_result=[[4, 10], [4, 20], [4, 30], [4, 30]]),
        dict(selectors='count(*) over (partition by b), a',
             expected_result=[[2, 10], [2, 20], [2, 30], [2, 30]]),
        dict(selectors='count(a) over (partition by b), a',
             expected_result=[[2, 10], [2, 20], [2, 30], [2, 30]]),
        dict(selectors='sum(count(*)) over ()',
             expected_result=[[4]]),
    )
    @unpack
    def test_analytic_function(self, selectors, expected_result):
        # type: (str, List[List[int]]) -> None
        '''Analytic (window) functions evaluate over the whole table or per partition.'''
        table_context = DatasetTableContext(
            {'my_project': {'my_dataset': {'my_table': TypedDataFrame(
                pd.DataFrame([[20, 200], [10, 200], [30, 300], [30, 300]],
                             columns=['a', 'b']),
                types=[BQScalarType.INTEGER, BQScalarType.INTEGER])}}})
        tokens = tokenize('select {} from my_table'.format(selectors))
        node, leftover = select_rule(tokens)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertFalse(leftover)
        # Note: BQ docs say if ORDER BY clause (for the select as a whole) is not present, order of
        # results is undefined, so we do not assert on the order.
        six.assertCountEqual(self, result.to_list_of_lists(), expected_result)

    @data(
        dict(selectors='sum(count(*)) over (), count(*)',
             expected_result=[[5, 2], [5, 3]]),
    )
    @unpack
    def test_analytic_function_with_group_by(self, selectors, expected_result):
        # type: (str, List[List[int]]) -> None
        '''Analytic functions may wrap aggregates computed by a GROUP BY.'''
        table_context = DatasetTableContext(
            {'my_project': {'my_dataset': {'my_table': TypedDataFrame(
                pd.DataFrame([[20, 2], [10, 2], [30, 3], [31, 3], [32, 3]],
                             columns=['a', 'b']),
                types=[BQScalarType.INTEGER, BQScalarType.INTEGER])}}})
        tokens = tokenize('select {} from my_table group by b'.format(selectors))
        node, leftover = select_rule(tokens)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertFalse(leftover)
        # Note: BQ docs say if ORDER BY clause (for the select as a whole) is not present, order of
        # results is undefined, so we do not assert on the order.
        six.assertCountEqual(self, result.to_list_of_lists(), expected_result)

    def test_non_aggregate_function_in_group_by(self):
        # type: () -> None
        '''A scalar function of a grouped-by column may appear inside an aggregate.'''
        table_context = DatasetTableContext(
            {'my_project': {'my_dataset': {'my_table': TypedDataFrame(
                pd.DataFrame([['one', '1'], ['two', '1'], ['three', '2'], ['four', '2']],
                             columns=['a', 'b']),
                types=[BQScalarType.STRING, BQScalarType.INTEGER])}}})

        tokens = tokenize('select max(concat(b, "hi")) from my_table group by b')
        node, leftover = select_rule(tokens)
        self.assertFalse(leftover)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertEqual(result.to_list_of_lists(), [['1hi'], ['2hi']])

    @data(
        dict(count='COUNT(*)', expected_result=[[2]]),
        dict(count='COUNT(c)', expected_result=[[2]]),
        dict(count='COUNT(DISTINCT c)', expected_result=[[1]]),
        dict(count='COUNT(b)', expected_result=[[2]]),
        dict(count='COUNT(DISTINCT b)', expected_result=[[2]]),
        dict(count='COUNT(a)', expected_result=[[1]]),
    )
    @unpack
    def test_count(self, count, expected_result):
        # type: (str, List[List[int]]) -> None
        '''COUNT skips NULLs, and DISTINCT collapses duplicates.'''
        count_table_context = DatasetTableContext(
            {'my_project': {'my_dataset': {'my_table': TypedDataFrame(
                pd.DataFrame([[1, 2, 3], [None, 4, 3]], columns=['a', 'b', 'c']),
                types=[BQScalarType.INTEGER, BQScalarType.INTEGER, BQScalarType.INTEGER])}}})
        select, leftover = select_rule(tokenize('SELECT {} FROM my_table'.format(count)))
        self.assertFalse(leftover)
        assert isinstance(select, Select)
        dataframe, unused_table_name = select.get_dataframe(count_table_context)
        self.assertEqual(dataframe.to_list_of_lists(), expected_result)

    @data(('IS_NULL', [True, False]),
          ('IS_NOT_NULL', [False, True]))
    @unpack
    def test_null_check(self, direction, result):
        # type: (str, List[bool]) -> None
        '''IS [NOT] NULL evaluates per row against a column containing a NULL.'''
        table_context = DatasetTableContext(
            {'my_project': {'my_dataset': {'my_table': TypedDataFrame(
                pd.DataFrame([[1, None], [2, 3]], columns=['a', 'b']),
                types=[BQScalarType.INTEGER, BQScalarType.INTEGER])}}})

        context = EvaluationContext(table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
        expression = Field(('b', ))
        null_check = NullCheck(expression, direction)

        typed_series = null_check.evaluate(context)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), result)

    @data(('IN', [True, False]),
          ('NOT_IN', [False, True]))
    @unpack
    def test_in_check(self, direction, result):
        # type: (str, List[bool]) -> None
        '''[NOT] IN evaluates membership of each row's value in a literal list.'''
        expression = Field(('a', ))
        elements = (Value(1, type_=BQScalarType.INTEGER), Value(3, type_=BQScalarType.INTEGER))
        in_check = InCheck(expression, direction, elements)

        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
        typed_series = in_check.evaluate(context)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), result)

    @data((True, 0),
          (False, 1))
    @unpack
    def test_if_empty_context(self, condition_bool, result):
        # type: (bool, int) -> None
        '''IF with a constant condition selects the THEN or ELSE constant.'''
        condition = Value(condition_bool, BQScalarType.BOOLEAN)
        then = Value(0, BQScalarType.INTEGER)
        else_ = Value(1, BQScalarType.INTEGER)
        # IF [condition] THEN 0 ELSE 1
        if_expression = If(condition, then, else_)

        typed_series = if_expression.evaluate(EMPTY_CONTEXT)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), [result])

    def test_if(self):
        # type: () -> None
        '''IF with a per-row condition selects THEN or ELSE row by row.'''
        condition = BinaryExpression(Field(('a', )), '>', Value(1, BQScalarType.INTEGER))
        then = Value('yes', BQScalarType.STRING)
        else_ = Value('no', BQScalarType.STRING)
        # IF a > 1 THEN "yes" ELSE "no"
        if_expression = If(condition, then, else_)

        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
        typed_series = if_expression.evaluate(context)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), ['no', 'yes'])

    def test_if_different_types(self):
        # type: () -> None
        '''IF branches of incompatible types raise ValueError.'''
        condition = Value(True, BQScalarType.BOOLEAN)
        then = Value('yes', BQScalarType.STRING)
        else_ = Value(1, BQScalarType.INTEGER)
        if_expression = If(condition, then, else_)

        error = (r"Cannot implicitly coerce the given types: "
                 r"\(BQScalarType.STRING, BQScalarType.INTEGER\)")
        with self.assertRaisesRegexp(ValueError, error):
            if_expression.evaluate(EMPTY_CONTEXT)

    def test_if_error(self):
        # type: () -> None
        '''A non-boolean IF condition raises ValueError.'''
        condition = Value(5, BQScalarType.INTEGER)
        then = Value(0, BQScalarType.INTEGER)
        else_ = Value(1, BQScalarType.INTEGER)
        if_expression = If(condition, then, else_)

        error = escape("IF condition isn't boolean! Found: {}".format(
            str(condition.evaluate(EMPTY_CONTEXT))))
        with self.assertRaisesRegexp(ValueError, error):
            if_expression.evaluate(EMPTY_CONTEXT)

    def test_not(self):
        # type: () -> None
        '''NOT inverts a boolean constant.'''
        expression = Value(True, BQScalarType.BOOLEAN)
        not_expression = Not(expression)

        typed_series = not_expression.evaluate(EMPTY_CONTEXT)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), [False])

    def test_not_type_error(self):
        # type: () -> None
        '''NOT of a non-boolean raises ValueError.'''
        expression = Value(5, BQScalarType.INTEGER)
        not_expression = Not(expression)

        with self.assertRaisesRegexp(ValueError, ""):
            not_expression.evaluate(EMPTY_CONTEXT)

    @data(
        (1, BQScalarType.INTEGER, -1),
        (1.0, BQScalarType.FLOAT, -1.0),
    )
    @unpack
    def test_unary_negation(self, initial_value, value_type, result_value):
        # type: (Any, BQScalarType, Any) -> None
        '''Unary minus negates integers and floats.'''
        expression = Value(initial_value, value_type)
        negation = UnaryNegation(expression)

        typed_series = negation.evaluate(EMPTY_CONTEXT)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), [result_value])

    @data(
        ("abc", BQScalarType.STRING),
        (True, BQScalarType.BOOLEAN),
    )
    @unpack
    def test_unary_negation_error(self, value, value_type):
        # type: (Any, BQScalarType) -> None
        '''Unary minus on a non-numeric type raises TypeError.'''
        expression = Value(value, value_type)
        negation = UnaryNegation(expression)

        error = ("UnaryNegation expression supports only integers and floats, got: {}"
                 .format(value_type))
        with self.assertRaisesRegexp(TypeError, error):
            negation.evaluate(EMPTY_CONTEXT)

    @data(
        dict(comparand=Field(('a', )),
             whens=[(Value(1, BQScalarType.INTEGER), Value("one", BQScalarType.STRING)),
                    (Value(2, BQScalarType.INTEGER), Value("two", BQScalarType.STRING))],
             else_=Value("other", BQScalarType.STRING),
             result=["one", "two"]),
        dict(comparand=Field(('a', )),
             whens=[(Value(1, BQScalarType.INTEGER), Value("one", BQScalarType.STRING))],
             else_=Value("other", BQScalarType.STRING),
             result=["one", "other"]),
        dict(comparand=EMPTY_NODE,
             whens=[(Value(True, BQScalarType.BOOLEAN), Value("yes", BQScalarType.STRING)),
                    (Value(False, BQScalarType.BOOLEAN), Value("no", BQScalarType.STRING))],
             else_=EMPTY_NODE,
             result=["yes", "yes"]),
        dict(comparand=Field(('a', )),
             whens=[(Value(1, BQScalarType.INTEGER), Value("one", BQScalarType.STRING))],
             else_=EMPTY_NODE,
             result=["one", None]),
    )
    @unpack
    def test_case_with_comparand(
            self,
            comparand,  # type: Union[_EmptyNode, EvaluatableNode]
            whens,  # type: List[Tuple[AbstractSyntaxTreeNode, EvaluatableNode]]
            else_,  # type: EvaluatableNode
            result  # type: List[str]
    ):
        # type: (...) -> None
        '''CASE evaluates per row, with and without a comparand and ELSE clause.'''
        case = Case(comparand, whens, else_)

        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
        typed_series = case.evaluate(context)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), result)
def test_invalid_value(self, value, type_): # type: (Optional[LiteralType], Optional[BQScalarType]) -> None '''Check that None is only allowed as both value and type_ or neither.''' with self.assertRaises(ValueError): Value(value, type_)