def test_value_repr(self, value, type_):
    # type: (Optional[LiteralType], Optional[BQScalarType]) -> None
    '''Check Value's string representation.

    The repr embeds the reprs of the type and the value, in that order,
    e.g. Value(type_=BQScalarType.INTEGER, value=5).
    '''
    node = Value(value, type_)
    # Use the builtin repr() instead of calling __repr__ directly.
    representation = 'Value(type_={}, value={})'.format(repr(type_), repr(value))
    self.assertEqual(repr(node), representation)
def test_not_an_operator(self):
    # type: () -> None
    '''An unrecognized operator token makes reparsing fail.'''
    operands = [
        Value(3, BQScalarType.INTEGER),
        'bar',
        Value(4, BQScalarType.INTEGER),
    ]
    with self.assertRaisesRegexp(ValueError, 'Unknown operator string bar'):
        _reparse_binary_expression(operands)
def test_value_eval(self):
    # type: () -> None
    '''A literal constant broadcasts to one entry per row of the context table.'''
    context = EvaluationContext(self.small_table_context)
    context.add_table_from_node(
        TableReference(('my_project', 'my_dataset', 'my_table')), 'foo')
    constant = Value(12345, BQScalarType.INTEGER)
    typed_series = constant.evaluate(context)
    assert isinstance(typed_series, TypedSeries)
    self.assertEqual(list(typed_series.series), [12345, 12345])
def test_if_different_types(self):
    # type: () -> None
    '''IF whose THEN/ELSE branches have incoercible types raises ValueError.'''
    if_expression = If(
        Value(True, BQScalarType.BOOLEAN),   # condition
        Value('yes', BQScalarType.STRING),   # then
        Value(1, BQScalarType.INTEGER))      # else
    error = (r"Cannot implicitly coerce the given types: "
             r"\(BQScalarType.STRING, BQScalarType.INTEGER\)")
    with self.assertRaisesRegexp(ValueError, error):
        if_expression.evaluate(EMPTY_CONTEXT)
def test_if_error(self):
    # type: () -> None
    '''A non-boolean IF condition is rejected at evaluation time.'''
    condition = Value(5, BQScalarType.INTEGER)
    if_expression = If(condition,
                       Value(0, BQScalarType.INTEGER),
                       Value(1, BQScalarType.INTEGER))
    # The message embeds the evaluated condition, so regex-escape it.
    error = escape("IF condition isn't boolean! Found: {}".format(
        str(condition.evaluate(EMPTY_CONTEXT))))
    with self.assertRaisesRegexp(ValueError, error):
        if_expression.evaluate(EMPTY_CONTEXT)
def test_if_empty_context(self, condition_bool, result):
    # type: (bool, int) -> None
    '''IF <condition> THEN 0 ELSE 1 evaluates to the branch matching the condition.'''
    if_expression = If(
        Value(condition_bool, BQScalarType.BOOLEAN),
        Value(0, BQScalarType.INTEGER),
        Value(1, BQScalarType.INTEGER))
    typed_series = if_expression.evaluate(EMPTY_CONTEXT)
    assert isinstance(typed_series, TypedSeries)
    self.assertEqual(list(typed_series.series), [result])
def test_in_check(self, direction, result):
    # type: (str, List[bool]) -> None
    '''a IN (1, 3) / a NOT IN (1, 3), checked against each row of column a.'''
    in_check = InCheck(
        Field(('a',)),
        direction,
        (Value(1, type_=BQScalarType.INTEGER),
         Value(3, type_=BQScalarType.INTEGER)))
    context = EvaluationContext(self.small_table_context)
    context.add_table_from_node(
        TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
    typed_series = in_check.evaluate(context)
    assert isinstance(typed_series, TypedSeries)
    self.assertEqual(list(typed_series.series), result)
def test_if(self):
    # type: () -> None
    '''IF a > 1 THEN "yes" ELSE "no", evaluated per row of the table.'''
    condition = BinaryExpression(
        Field(('a',)), '>', Value(1, BQScalarType.INTEGER))
    if_expression = If(condition,
                       Value('yes', BQScalarType.STRING),
                       Value('no', BQScalarType.STRING))
    context = EvaluationContext(self.small_table_context)
    context.add_table_from_node(
        TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
    typed_series = if_expression.evaluate(context)
    assert isinstance(typed_series, TypedSeries)
    self.assertEqual(list(typed_series.series), ['no', 'yes'])
def test_query_expression_limit(self):
    # type: () -> None
    '''LIMIT 1 OFFSET 1 keeps only the second row of the table.'''
    table_ref = TableReference(('my_project', 'my_dataset', 'my_table'))
    from_ = DataSource((table_ref, EMPTY_NODE), [])
    selector = StarSelector(EMPTY_NODE, EMPTY_NODE, EMPTY_NODE)
    select = Select(EMPTY_NODE, [selector], from_,
                    EMPTY_NODE, EMPTY_NODE, EMPTY_NODE)
    limit_offset = (Value(1, BQScalarType.INTEGER),
                    Value(1, BQScalarType.INTEGER))
    qe = QueryExpression(EMPTY_NODE, select, EMPTY_NODE, limit_offset)
    dataframe, table_name = qe.get_dataframe(self.table_context)
    self.assertEqual(dataframe.to_list_of_lists(), [[2]])
def test_extract(self, part, result):
    # type: (str, int) -> None
    '''EXTRACT(<part> FROM TIMESTAMP '2019-05-09') yields the expected field.'''
    timestamp = Value(pd.Timestamp('2019-05-09'), BQScalarType.TIMESTAMP)
    typed_series = Extract(part, timestamp).evaluate(EMPTY_CONTEXT)
    assert isinstance(typed_series, TypedSeries)
    self.assertEqual(list(typed_series.series), [result])
def test_not(self):
    # type: () -> None
    '''NOT TRUE evaluates to FALSE.'''
    not_expression = Not(Value(True, BQScalarType.BOOLEAN))
    typed_series = not_expression.evaluate(EMPTY_CONTEXT)
    assert isinstance(typed_series, TypedSeries)
    self.assertEqual(list(typed_series.series), [False])
def test_unary_negation(self, initial_value, value_type, result_value):
    # type: (Any, BQScalarType, Any) -> None
    '''Unary minus negates numeric values.'''
    negation = UnaryNegation(Value(initial_value, value_type))
    typed_series = negation.evaluate(EMPTY_CONTEXT)
    assert isinstance(typed_series, TypedSeries)
    self.assertEqual(list(typed_series.series), [result_value])
def test_extract_unimplemented(self):
    # type: () -> None
    '''An unimplemented EXTRACT date part raises NotImplementedError.'''
    timestamp = Value(pd.Timestamp('2019-05-09'), BQScalarType.TIMESTAMP)
    extract = Extract('WEEK(TUESDAY)', timestamp)
    with self.assertRaisesRegexp(NotImplementedError,
                                 r'WEEK\(TUESDAY\) not implemented'):
        extract.evaluate(EMPTY_CONTEXT)
def test_unary_negation_error(self, value, value_type):
    # type: (Any, BQScalarType) -> None
    '''Unary minus rejects non-numeric operands with a TypeError.'''
    negation = UnaryNegation(Value(value, value_type))
    error = ("UnaryNegation expression supports only integers and floats, got: {}"
             .format(value_type))
    with self.assertRaisesRegexp(TypeError, error):
        negation.evaluate(EMPTY_CONTEXT)
def test_even_length_sequence_raises(self):
    # type: () -> None
    '''A binary-expression token sequence must alternate operand/operator (odd length).'''
    sequence = [Value(3, BQScalarType.INTEGER), '+']
    with self.assertRaisesRegexp(ValueError, 'Sequence must be of odd length'):
        _reparse_binary_expression(sequence)
class DataframeNodeTest(unittest.TestCase):
    '''Tests for dataframe-producing AST nodes (Select, QueryExpression, TableReference).'''

    def setUp(self):
        # type: () -> None
        # Single-column table my_table = [[1], [2], [3]], shared by most tests below.
        self.table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(pd.DataFrame([[1], [2], [3]], columns=['a']),
                                   types=[BQScalarType.INTEGER])
                }
            }
        })

    def test_marker_syntax_tree_node(self):
        # type: () -> None
        self.assertEqual(Select.literal(), 'SELECT')

    def test_query_expression(self):
        # type: () -> None
        from_ = DataSource((TableReference(
            ('my_project', 'my_dataset', 'my_table')), EMPTY_NODE), [])
        selector = StarSelector(EMPTY_NODE, EMPTY_NODE, EMPTY_NODE)
        select = Select(EMPTY_NODE, [selector], from_, EMPTY_NODE, EMPTY_NODE,
                        EMPTY_NODE)
        qe = QueryExpression(EMPTY_NODE, select, EMPTY_NODE, EMPTY_NODE)
        dataframe, table_name = qe.get_dataframe(self.table_context)
        self.assertEqual(table_name, None)
        self.assertEqual(dataframe.to_list_of_lists(), [[1], [2], [3]])
        self.assertEqual(list(dataframe.dataframe), ['a'])
        self.assertEqual(dataframe.types, [BQScalarType.INTEGER])

    def test_query_expression_limit(self):
        # type: () -> None
        # LIMIT 1 OFFSET 1 keeps only the second row.
        from_ = DataSource((TableReference(
            ('my_project', 'my_dataset', 'my_table')), EMPTY_NODE), [])
        selector = StarSelector(EMPTY_NODE, EMPTY_NODE, EMPTY_NODE)
        select = Select(EMPTY_NODE, [selector], from_, EMPTY_NODE, EMPTY_NODE,
                        EMPTY_NODE)
        limit = Value(1, BQScalarType.INTEGER)
        offset = Value(1, BQScalarType.INTEGER)
        qe = QueryExpression(EMPTY_NODE, select, EMPTY_NODE, (limit, offset))
        dataframe, table_name = qe.get_dataframe(self.table_context)
        self.assertEqual(dataframe.to_list_of_lists(), [[2]])

    @data(
        dict(query_expression=
             'select a as d from my_table union all select c from my_table',
             expected_result=[[1], [3]]),
        dict(
            query_expression='select * from my_table union all select 4, 5, 6',
            expected_result=[[1, 2, 3], [4, 5, 6]]),
        dict(query_expression='select 1 union all select 2.0',
             expected_result=[[1.0], [2.0]]),
    )
    @unpack
    def test_query_expression_set_operation(self, query_expression,
                                            expected_result):
        table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(
                        pd.DataFrame([[1, 2, 3]], columns=['a', 'b', 'c']), [
                            BQScalarType.INTEGER, BQScalarType.INTEGER,
                            BQScalarType.INTEGER
                        ])
                }
            }
        })
        query_expression_node, leftover = query_expression_rule(
            tokenize(query_expression))
        dataframe, unused_table_name = query_expression_node.get_dataframe(
            table_context)
        self.assertFalse(leftover)
        self.assertEqual(dataframe.to_list_of_lists(), expected_result)

    @data(
        dict(query_expression='select 1 union all select 2, 3',
             error='mismatched column count: 1 vs 2'),
        dict(query_expression='select 1 union all select "foo"',
             error='Cannot implicitly coerce the given types'),
    )
    @unpack
    def test_query_expression_set_operation_error(self, query_expression,
                                                  error):
        table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(
                        pd.DataFrame([[1, 2, 3]], columns=['a', 'b', 'c']), [
                            BQScalarType.INTEGER, BQScalarType.INTEGER,
                            BQScalarType.INTEGER
                        ])
                }
            }
        })
        query_expression_node, leftover = query_expression_rule(
            tokenize(query_expression))
        self.assertFalse(leftover)
        with self.assertRaisesRegexp(ValueError, error):
            query_expression_node.get_dataframe(table_context)

    @data(
        dict(query_expression=
             'WITH q1 as (SELECT a+1 as b FROM my_table) SELECT * from q1',
             expected_result=[[2], [3], [4]]),
        # This example is closely based on
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#with-clause
        dict(
            query_expression=(
                'WITH q1 AS (SELECT a+1 as b FROM my_table LIMIT 1)'  # q1 is [[2]]
                'SELECT *'
                'FROM'
                ' (WITH q2 AS (SELECT b+1 as c FROM q1),'  # q1 resolves to [[2]]
                ' q3 AS (SELECT b+2 as d FROM q1),'  # q1 resolves to [[2]]
                ' q1 AS (SELECT b+4 as e FROM q1),'  # q1 (in the query) resolves to [[2]]
                ' q4 AS (SELECT e+8 as f FROM q1)'  # q1 resolves to the WITH subquery
                # on the previous line.
                ' SELECT e, c, d, f FROM q1, q2, q3, q4)'
            ),  # q1 resolves to 3rd inner WITH subquery.
            expected_result=[[6, 3, 4, 14]],
        ),
    )
    @unpack
    def test_with_clause(self, query_expression, expected_result):
        # type: (str, List[List[int]]) -> None
        query_expression_node, leftover = query_expression_rule(
            tokenize(query_expression))
        self.assertFalse(leftover)
        assert isinstance(query_expression_node, QueryExpression)
        dataframe, _ = query_expression_node.get_dataframe(self.table_context)
        self.assertEqual(dataframe.to_list_of_lists(), expected_result)

    @data(
        dict(query_expression=('WITH q1 as (SELECT a+1 as b FROM my_table),'
                               'q1 as (SELECT a+2 as b FROM my_table)'
                               'SELECT * from q1'),
             error='Duplicate names in WITH clauses are not allowed'),
    )
    @unpack
    def test_with_clause_error(self, query_expression, error):
        # type: (str, str) -> None
        query_expression_node, leftover = query_expression_rule(
            tokenize(query_expression))
        self.assertFalse(leftover)
        assert isinstance(query_expression_node, QueryExpression)
        with self.assertRaisesRegexp(ValueError, error):
            query_expression_node.get_dataframe(self.table_context)

    # constants for use in data-driven syntax tests below
    with_clause = 'with foo as (select 1), bar as (select 2)'
    select_clause = 'select 3'
    order_by_clause = 'order by a, b'
    limit_clause = 'limit 5'

    @data(
        # core query expression grammar (no recursion)
        dict(query=(with_clause, select_clause, order_by_clause, limit_clause)),
        dict(query=(with_clause, select_clause, limit_clause)),
        dict(query=(with_clause, select_clause, order_by_clause)),
        dict(query=(with_clause, select_clause)),
        dict(query=(select_clause, order_by_clause, limit_clause)),
        dict(query=(select_clause, limit_clause)),
        dict(query=(select_clause, order_by_clause)),
        dict(query=(select_clause, )),
        # parenthesized recursion
        dict(query=(with_clause, '(', with_clause, select_clause,
                    order_by_clause, limit_clause, ')', order_by_clause,
                    limit_clause)),
        dict(query=(with_clause, '(', with_clause, select_clause,
                    order_by_clause, limit_clause, ')', limit_clause)),
        dict(query=(with_clause, '(', with_clause, select_clause,
                    order_by_clause, limit_clause, ')', order_by_clause)),
        dict(query=(with_clause, '(', with_clause, select_clause,
                    order_by_clause, limit_clause, ')')),
        # set operations
        dict(query=(select_clause, 'UNION ALL', select_clause)),  # simplest case
        dict(query=(
            # first query expression
            with_clause, select_clause, order_by_clause, limit_clause,
            'UNION ALL',
            # second query expression
            with_clause, select_clause, order_by_clause, limit_clause)),
    )
    @unpack
    def test_query_expression_syntax(self, query):
        # Only checks that the token sequence parses completely; no evaluation.
        query_expression = ' '.join(query)
        query_expression_node, leftover = query_expression_rule(
            tokenize(query_expression))
        self.assertFalse(leftover)

    @data(
        dict(query_expression='select * from my_table order by a DESC',
             expected_result=[[3], [2], [1]]),
        dict(query_expression='select a from my_table order by 1 DESC',
             expected_result=[[3], [2], [1]]),
    )
    @unpack
    def test_query_expression_order_by(self, query_expression,
                                       expected_result):
        # type: (str, List[List[int]]) -> None
        query_expression_node, leftover = query_expression_rule(
            tokenize(query_expression))
        assert isinstance(query_expression_node, QueryExpression)
        dataframe, unused_table_name = query_expression_node.get_dataframe(
            self.table_context)
        self.assertFalse(leftover)
        self.assertEqual(dataframe.to_list_of_lists(), expected_result)

    def test_select(self):
        # type: () -> None
        from_ = DataSource((TableReference(
            ('my_project', 'my_dataset', 'my_table')), EMPTY_NODE), [])
        selector = StarSelector(EMPTY_NODE, EMPTY_NODE, EMPTY_NODE)
        select = Select(EMPTY_NODE, [selector], from_, EMPTY_NODE, EMPTY_NODE,
                        EMPTY_NODE)
        dataframe, table_name = select.get_dataframe(self.table_context)
        self.assertEqual(table_name, None)
        self.assertEqual(dataframe.to_list_of_lists(), [[1], [2], [3]])
        self.assertEqual(list(dataframe.dataframe), ['a'])
        self.assertEqual(dataframe.types, [BQScalarType.INTEGER])

    @data(
        dict(select='select c from my_table group by c',
             expected_result=[[3]]),
        dict(select='select my_table.c from my_table group by c',
             expected_result=[[3]]),
        dict(select='select c from my_table group by my_table.c',
             expected_result=[[3]]),
        dict(select='select my_table.c from my_table group by my_table.c',
             expected_result=[[3]]),
        dict(select='select c from my_table group by 1',
             expected_result=[[3]]),
        dict(select='select max(b), c from my_table group by 2',
             expected_result=[[3, 3]]),
        dict(select='select c+1 from my_table group by c',
             expected_result=[[4]]),
        dict(select='select c+1 from my_table group by 1',
             expected_result=[[4]]),
        dict(select='select c+1 from my_table group by c, 1',
             expected_result=[[4]]),
        dict(select='select c+1 as d from my_table group by c,d',
             expected_result=[[4]]),
        dict(select='select c+1 as d from my_table group by c,1',
             expected_result=[[4]]),
        dict(select='select c+1 from my_table group by c,1',
             expected_result=[[4]]),
        # Naively evaluating the below after the group by has occurred fails with
        # Column(s) my_table.c already selected
        dict(select='select c+1 as d, count(*) from my_table group by d',
             expected_result=[[4, 2]]),
        # Naively evaluating the below after the group by has occurred tries and fails to add
        # two SeriesGroupbys
        dict(select='select a+c as d from my_table group by d',
             expected_result=[[4]]),
    )
    @unpack
    def test_select_group_by(self, select, expected_result):
        # type: (str, List[List[int]]) -> None
        group_table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(pd.DataFrame([[1, 2, 3], [1, 3, 3]],
                                                columns=['a', 'b', 'c']),
                                   types=[
                                       BQScalarType.INTEGER,
                                       BQScalarType.INTEGER,
                                       BQScalarType.INTEGER
                                   ])
                }
            }
        })
        select_node, leftover = select_rule(tokenize(select))
        assert isinstance(select_node, Select)
        dataframe, unused_table_name = select_node.get_dataframe(
            group_table_context)
        self.assertFalse(leftover)
        self.assertEqual(dataframe.to_list_of_lists(), expected_result)

    @data(
        dict(select='select b from my_table group by c'),
        dict(select='select b,c from my_table group by c'),
        # Note: a is actually constant within the group, but it doesn't matter because you can't
        # tell that statically from the query.
        dict(select='select a from my_table group by c'),
    )
    @unpack
    def test_select_group_by_error(self, select):
        # type: (str) -> None
        group_table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(pd.DataFrame([[1, 2, 3], [1, 3, 3]],
                                                columns=['a', 'b', 'c']),
                                   types=[
                                       BQScalarType.INTEGER,
                                       BQScalarType.INTEGER,
                                       BQScalarType.INTEGER
                                   ])
                }
            }
        })
        select_node, leftover = select_rule(tokenize(select))
        assert isinstance(select_node, Select)
        self.assertFalse(leftover)
        with self.assertRaisesRegexp(ValueError,
                                     "not aggregated or grouped by"):
            select_node.get_dataframe(group_table_context)

    @data(
        dict(select='select distinct a from my_table', expected_result=[[1]]),
        dict(select='select distinct b from my_table',
             expected_result=[[2], [3]]),
        dict(select='select distinct a, b from my_table',
             expected_result=[[1, 2], [1, 3]]),
    )
    @unpack
    def test_select_distinct(self, select, expected_result):
        # type: (str, List[List[int]]) -> None
        table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(
                        pd.DataFrame([[1, 2], [1, 3]], columns=['a', 'b']),
                        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
                }
            }
        })
        select_node, leftover = select_rule(tokenize(select))
        assert isinstance(select_node, Select)
        dataframe, unused_table_name = select_node.get_dataframe(table_context)
        self.assertFalse(leftover)
        self.assertEqual(dataframe.to_list_of_lists(), expected_result)

    @data(
        # WHERE b = 4
        (BinaryExpression(Field(('b', )), '=',
                          Value(value=4, type_=BQScalarType.INTEGER)), ),
        # WHERE b = 4 AND a = 3
        (BinaryExpression(
            BinaryExpression(Field(('b', )), '=',
                             Value(value=4, type_=BQScalarType.INTEGER)),
            'AND',
            BinaryExpression(Field(('a', )), '=',
                             Value(value=3, type_=BQScalarType.INTEGER))), ))
    @unpack
    def test_select_where(self, where):
        # type: (EvaluatableNode) -> None
        where_table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(
                        pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b']),
                        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
                }
            }
        })
        fields = [Selector(Field(('a', )), EMPTY_NODE)]
        from_ = DataSource((TableReference(
            ('my_project', 'my_dataset', 'my_table')), EMPTY_NODE), [])
        select = Select(EMPTY_NODE, fields, from_, where, EMPTY_NODE,
                        EMPTY_NODE)
        dataframe, table_name = select.get_dataframe(where_table_context)
        self.assertEqual(dataframe.to_list_of_lists(), [[3]])

    @data(
        dict(select='SELECT sum(a) as c FROM my_table GROUP BY b HAVING c > 4',
             expected_result=[[7]]),
        dict(select='SELECT b FROM my_table GROUP BY b HAVING sum(a) > 4',
             expected_result=[[3]]),
        dict(select='SELECT sum(a) as c FROM my_table GROUP BY b HAVING b > 4',
             expected_result=[[4]]),
        dict(
            select=
            'SELECT sum(a) as c FROM my_table GROUP BY b HAVING b=1 AND c<0 AND MIN(a)<0',
            expected_result=[[-98]]),
    )
    @unpack
    def test_select_having(self, select, expected_result):
        # type: (str, List[List[int]]) -> None
        group_table_context = DatasetTableContext({
            'my_project': {
                'my_dataset': {
                    'my_table':
                    TypedDataFrame(
                        pd.DataFrame(
                            [[1, 3], [6, 3], [4, 5], [-100, 1], [2, 1]],
                            columns=['a', 'b']),
                        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
                }
            }
        })
        select_node, leftover = select_rule(tokenize(select))
        assert isinstance(select_node, Select)
        dataframe, unused_table_name = select_node.get_dataframe(
            group_table_context)
        self.assertFalse(leftover)
        self.assertEqual(dataframe.to_list_of_lists(), expected_result)

    @data(
        dict(select='SELECT * EXCEPT (a) FROM table1',
             expected_result=[[8, 4], [3, 0], [10, 1]]),
        dict(select='SELECT * EXCEPT (a, c) FROM table1',
             expected_result=[[8], [3], [10]]),
        dict(select='SELECT * REPLACE (a/2 as a) FROM table1',
             expected_result=[[1, 8, 4], [3, 3, 0], [6, 10, 1]]),
        dict(select='SELECT table1.*, d FROM table1 JOIN table2 USING (a)',
             expected_result=[[2, 8, 4, 7], [6, 3, 0, 2], [12, 10, 1, 9]]),
    )
    @unpack
    def test_select_star(self, select, expected_result):
        # type: (str, List[List[int]]) -> None
        group_table_context = DatasetTableContext({
            'p': {
                'd': {
                    'table1':
                    TypedDataFrame(
                        pd.DataFrame([[2, 8, 4], [6, 3, 0], [12, 10, 1]],
                                     columns=['a', 'b', 'c']),
                        types=[
                            BQScalarType.INTEGER, BQScalarType.INTEGER,
                            BQScalarType.INTEGER
                        ]),
                    'table2':
                    TypedDataFrame(
                        pd.DataFrame([[2, 7, 3], [6, 2, -1], [12, 9, 0]],
                                     columns=['a', 'd', 'e']),
                        types=[
                            BQScalarType.INTEGER, BQScalarType.INTEGER,
                            BQScalarType.INTEGER
                        ]),
                }
            }
        })
        select_node, leftover = select_rule(tokenize(select))
        assert isinstance(select_node, Select)
        dataframe, unused_table_name = select_node.get_dataframe(
            group_table_context)
        self.assertFalse(leftover)
        self.assertEqual(dataframe.to_list_of_lists(), expected_result)

    @data((('my_project', 'my_dataset', 'my_table'), ),
          (('my_dataset', 'my_table'), ), (('my_table', ), ),
          (('my_project.my_dataset.my_table', ), ))
    @unpack
    def test_table_reference(self, reference):
        # type: (Tuple[str, ...]) -> None
        table_ref = TableReference(reference)
        dataframe, table_name = table_ref.get_dataframe(self.table_context)
        self.assertEqual(table_name, 'my_table')
        self.assertEqual(dataframe.to_list_of_lists(), [[1], [2], [3]])
        self.assertEqual(list(dataframe.dataframe), ['a'])
        self.assertEqual(dataframe.types, [BQScalarType.INTEGER])

    def test_table_reference_multi_project(self):
        # type: () -> None
        # A partially-qualified name is ambiguous across multiple projects.
        new_table_context = DatasetTableContext({
            'project1': {
                'dataset1': {
                    'table1': TypedDataFrame(pd.DataFrame(), [])
                }
            },
            'project2': {
                'dataset2': {
                    'table2': TypedDataFrame(pd.DataFrame(), [])
                }
            }
        })
        table_ref = TableReference(('dataset1', 'table1'))
        expected_error = "Non-fully-qualified table \\('dataset1', 'table1'\\) with multiple "\
            "possible projects \\['project1', 'project2'\\]"
        with self.assertRaisesRegexp(ValueError, expected_error):
            table_ref.get_dataframe(new_table_context)

    def test_table_reference_multi_dataset(self):
        # type: () -> None
        # A bare table name is ambiguous across multiple datasets.
        new_table_context = DatasetTableContext({
            'project1': {
                'dataset1': {
                    'table1': TypedDataFrame(pd.DataFrame(), [])
                },
                'dataset2': {
                    'table2': TypedDataFrame(pd.DataFrame(), [])
                }
            },
        })
        table_ref = TableReference(('table1', ))
        expected_error = "Non-fully-qualified table \\('table1',\\) with multiple possible "\
            "datasets \\['dataset1', 'dataset2'\\]"
        with self.assertRaisesRegexp(ValueError, expected_error):
            table_ref.get_dataframe(new_table_context)

    @data(
        dict(
            query='SELECT * from UNNEST([])',
            result=[],
            # UNNEST always returns one column unless the result is a struct.
            result_columns=['f0_']),
        dict(query='SELECT * from UNNEST([1, 2, 3])',
             result=[[1], [2], [3]],
             result_columns=['f0_']),
        dict(query='SELECT * from UNNEST(["a", "b", "c"])',
             result=[['a'], ['b'], ['c']],
             result_columns=['f0_']),
        dict(query='SELECT * from UNNEST([(1, "a"), (2, "b")])',
             result=[[1, 'a'], [2, 'b']],
             result_columns=['f0_', 'f1_']),
        dict(
            query=
            'SELECT x, y from UNNEST([STRUCT<x INTEGER, y STRING>(1, "a"), (2, "b")])',
            result=[[1, 'a'], [2, 'b']],
            result_columns=['x', 'y']),
    )
    @unpack
    def test_unnest(self, query, result, result_columns):
        # type: (str, List[List[Any]], List[str]) -> None
        node, leftover = query_expression_rule(tokenize(query))
        self.assertFalse(leftover)
        assert isinstance(node, QueryExpression)
        dataframe, _ = node.get_dataframe(TableContext())
        self.assertEqual(dataframe.to_list_of_lists(), result)
        self.assertEqual(list(dataframe.dataframe.columns), result_columns)

    @data(
        dict(query='SELECT * from UNNEST([1, "a"])',
             expected_error='Cannot implicitly coerce the given types'),
    )
    @unpack
    def test_unnest_error(self, query, expected_error):
        # type: (str, str) -> None
        node, leftover = query_expression_rule(tokenize(query))
        self.assertFalse(leftover)
        assert isinstance(node, QueryExpression)
        with self.assertRaisesRegexp(ValueError, expected_error):
            node.get_dataframe(TableContext())
def test_literals_false(self):
    # type: () -> None
    '''FALSE (even lowercase) parses to a boolean Value.'''
    expected = (Value(False, BQScalarType.BOOLEAN), [])
    self.assertEqual(literal(['false']), expected)
def test_literals_true(self):
    # type: () -> None
    '''TRUE parses to a boolean Value; leftover tokens are passed back.'''
    expected = (Value(True, BQScalarType.BOOLEAN), ['abc'])
    self.assertEqual(literal(['TRUE', 'abc']), expected)
def test_literals_null(self):
    # type: () -> None
    '''NULL parses to the typeless null Value.'''
    expected = (Value(None, None), [])
    self.assertEqual(literal(['NULL']), expected)
def test_literals_string(self):
    # type: () -> None
    '''A double-quoted token parses to a STRING Value.'''
    expected = (Value('something', BQScalarType.STRING), [])
    self.assertEqual(literal(['"something"']), expected)
def test_literals_integer(self):
    # type: () -> None
    '''A digit-only token parses to an INTEGER Value.'''
    expected = (Value(11, BQScalarType.INTEGER), [])
    self.assertEqual(literal(['11']), expected)
def test_literals_float(self):
    # type: () -> None
    '''A token with a decimal point parses to a FLOAT Value.'''
    expected = (Value(1.23, BQScalarType.FLOAT), [])
    self.assertEqual(literal(['1.23']), expected)
def test_not_type_error(self):
    # type: () -> None
    '''NOT applied to a non-boolean operand raises ValueError.'''
    not_expression = Not(Value(5, BQScalarType.INTEGER))
    with self.assertRaisesRegexp(ValueError, ""):
        not_expression.evaluate(EMPTY_CONTEXT)
class QueryHelperTest(unittest.TestCase):
    '''Tests for the grammar-combinator helpers (apply_rule, separated_sequence).'''

    @data(dict(rule='SELECT',
               tokens=['SELECT', '*', 'FROM', 'TABLE'],
               result=('SELECT', ['*', 'FROM', 'TABLE']),
               comment='Rule defined by string'),
          dict(rule=('ORDER', 'BY'),
               tokens=['ORDER', 'BY', 'SomeField'],
               result=((), ['SomeField']),
               comment='Rule defined by tuple'),
          dict(rule=['FROM', 'WHERE'],
               tokens=['FROM', 'SomeTable'],
               result=('FROM', ['SomeTable']),
               comment='Rule defined by list'),
          dict(rule=['FROM', 'WHERE', None],
               tokens=['SomethingElse'],
               result=(EMPTY_NODE, ['SomethingElse']),
               comment='Rule defined by optional list'),
          dict(rule=None,
               tokens=['a', 'b', 'c'],
               result=(EMPTY_NODE, ['a', 'b', 'c']),
               comment='Rule is None'),
          dict(rule=literal,
               tokens=['1.23'],
               result=(Value(1.23, BQScalarType.FLOAT), []),
               comment='Rule defined by a method'),
          dict(rule=(TestNode, identifier, identifier),
               tokens=['a', 'b'],
               result=(TestNode('a', 'b'), []),
               comment='Rule defined by an abstract syntax tree node'),
          dict(rule='SELECT',
               tokens=['WHERE'],
               result=(None, ['WHERE']),
               comment='Rule defined by string does not match'),
          dict(rule=('ORDER', 'BY'),
               tokens=['ORDER', 'WITH', 'SomeField'],
               result=(None, ['ORDER', 'WITH', 'SomeField']),
               comment='Rule defined by tuple does not match'),
          dict(rule=['FROM', 'WHERE'],
               tokens=['SomeTable'],
               result=(None, ['SomeTable']),
               comment='Rule defined by list does not match'))
    @unpack
    def test_apply_rule(self,
                        rule,  # type: RuleType
                        tokens,  # type: List[str]
                        result,  # type: AppliedRuleOutputType
                        comment  # type: str
                        ):
        # type: (...) -> None
        self.assertEqual(apply_rule(rule, tokens), result)

    @data(dict(rule=identifier,
               separator=['ASC', 'DESC'],
               tokens=['a', 'ASC', 'b', 'DESC'],
               result=(('a', 'b'), []),
               comment='Rule is a method, separator is either ASC or DESC'),
          dict(rule=literal,
               separator=',',
               tokens=['1', ',', '2', ',', '3'],
               result=((Value(1, BQScalarType.INTEGER),
                        Value(2, BQScalarType.INTEGER),
                        Value(3, BQScalarType.INTEGER)), []),
               comment='Rule is a method, separator is a comma'),
          dict(rule=(identifier, '=', identifier),
               separator='AND',
               tokens=['field1', '=', 'field2', 'AND', 'field3', '=', 'field4'],
               result=((('field1', 'field2'), ('field3', 'field4')), []),
               comment='Rule is a more complex tuple, separator is AND'))
    @unpack
    def test_separated_sequence(self,
                                rule,  # type: RuleType
                                separator,  # type: RuleType
                                tokens,  # type: List[str]
                                result,  # type: AppliedRuleOutputType
                                comment  # type: str
                                ):
        # type: (...) -> None
        self.assertEqual(separated_sequence(rule, separator)(tokens), result)

    def test_separated_sequence_keep_separator(self):
        '''Test keep_separator parameter'''
        sequence_check = separated_sequence(identifier, ',',
                                            keep_separator=True)
        self.assertEqual(sequence_check(['a', ',', 'b']),
                         (('a', ',', 'b'), []))

    def test_separated_sequence_wrapper(self):
        '''Test wrapper parameter'''
        sequence_check = separated_sequence(identifier, ',', wrapper=list)
        self.assertEqual(sequence_check(['a', ',', 'b']), (['a', 'b'], []))
typed_series = case.evaluate(context) assert isinstance(typed_series, TypedSeries) self.assertEqual(list(typed_series.series), result) def test_case_no_whens(self): comparand = EMPTY_NODE whens = [] else_ = EMPTY_NODE error = "Must provide at least one WHEN for a CASE" with self.assertRaisesRegexp(ValueError, error): Case(comparand, whens, else_) @data( dict(comparand=EMPTY_NODE, whens=[(Value(1, BQScalarType.INTEGER), Value("one", BQScalarType.STRING))], else_=EMPTY_NODE, error="CASE condition isn't boolean! Found: {!r}".format( TypedSeries(pd.Series([1, 1]), BQScalarType.INTEGER))), dict(comparand=Field(('a', )), whens=[(Value(1, BQScalarType.INTEGER), Value("one", BQScalarType.STRING))], else_=Value(100, BQScalarType.INTEGER), error="Cannot implicitly coerce the given types: " "(BQScalarType.STRING, BQScalarType.INTEGER)"), ) @unpack def test_case_error( self, comparand, # type: Union[_EmptyNode, EvaluatableNode]
class EvaluatableNodeTest(unittest.TestCase):
    '''Tests evaluation of expression AST nodes against small in-memory tables.'''

    def setUp(self):
        # type: () -> None
        # Two shared fixtures: a one-column, two-row table and a
        # three-column, two-row table (column 'a' repeats so GROUP BY
        # tests have something to collapse).
        self.small_table_context = DatasetTableContext(
            {'my_project': {'my_dataset': {'my_table': TypedDataFrame(
                pd.DataFrame([[1], [2]], columns=['a']),
                types=[BQScalarType.INTEGER])}}})
        self.large_table_context = DatasetTableContext(
            {'my_project': {'my_dataset': {'my_table': TypedDataFrame(
                pd.DataFrame([[1, 2, 3], [1, 4, 3]], columns=['a', 'b', 'c']),
                types=[BQScalarType.INTEGER, BQScalarType.INTEGER, BQScalarType.INTEGER])}}})

    def test_selector(self):
        # type: () -> None
        '''A Selector evaluates its expression and renames the column to its alias.'''
        selector = Selector(Field(('a', )), 'field_alias')
        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
        typed_series = selector.evaluate(context)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), [1, 2])
        self.assertEqual(list(typed_series.dataframe), ['field_alias'])
        self.assertEqual(typed_series.types, [BQScalarType.INTEGER])

    def test_selector_group_by_success(self):
        # type: () -> None
        '''A selector of a grouped-by column evaluates to the distinct group values.'''
        selector = Selector(Field(('c', )), EMPTY_NODE)
        selector.position = 1
        context = EvaluationContext(self.large_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
        context.exclude_aggregation = True
        updated_selector, = context.do_group_by([selector], [Field(('my_table', 'c'))])
        typed_series = updated_selector.evaluate(context)
        assert isinstance(typed_series, TypedSeries)
        # Both rows have c == 3, so grouping by c yields a single value.
        self.assertEqual(list(typed_series.series), [3])

    @data((5, BQScalarType.INTEGER),
          (1.23, BQScalarType.FLOAT),
          ("something", BQScalarType.STRING),
          (True, BQScalarType.BOOLEAN),
          (None, None))
    @unpack
    def test_value_repr(self, value, type_):
        # type: (Optional[LiteralType], Optional[BQScalarType]) -> None
        '''Check Value's string representation'''
        node = Value(value, type_)
        representation = 'Value(type_={}, value={})'.format(type_.__repr__(),
                                                            value.__repr__())
        self.assertEqual(node.__repr__(), representation)

    @data((5, None),
          (None, BQScalarType.INTEGER))
    @unpack
    def test_invalid_value(self, value, type_):
        # type: (Optional[LiteralType], Optional[BQScalarType]) -> None
        '''Check that None is only allowed as both value and type_ or neither.'''
        with self.assertRaises(ValueError):
            Value(value, type_)

    def test_value_eval(self):
        # type: () -> None
        # A constant is repeated for each row in the context table.
        value = Value(12345, BQScalarType.INTEGER)
        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')), 'foo')
        typed_series = value.evaluate(context)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), [12345, 12345])

    def test_field(self):
        # type: () -> None
        '''A Field evaluates to the named column of the context table.'''
        field = Field(('a', ))
        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
        typed_series = field.evaluate(context)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), [1, 2])
        self.assertEqual(typed_series.series.name, 'a')

    @data(
        dict(function_name='sum', args=[Field(('a', ))], expected_result=[3],
             is_aggregating=True),
        dict(function_name='max', args=[Field(('a', ))], expected_result=[2],
             is_aggregating=True),
        dict(function_name='min', args=[Field(('a', ))], expected_result=[1],
             is_aggregating=True),
        dict(function_name='concat',
             args=[Value('foo', BQScalarType.STRING),
                   Value('bar', BQScalarType.STRING)],
             expected_result=['foobar'] * 2),  # two copies to match length of context table.
        dict(function_name='mod',
             args=[Field(('a', )), Value(2, BQScalarType.INTEGER)],
             expected_result=[1, 0]),
        dict(function_name='mod',
             args=[Value(1.0, BQScalarType.FLOAT), Value(2, BQScalarType.INTEGER)],
             expected_result=[1.0, 1.0]),
        dict(function_name='timestamp',
             args=[Value("2019-04-22", BQScalarType.STRING)],
             expected_result=[datetime.datetime(2019, 4, 22)] * 2),  # two copies to match table len
    )
    @unpack
    def test_functions(self, function_name, args, expected_result, is_aggregating=False):
        # type: (str, List[EvaluatableNode], List[PythonType], bool) -> None
        '''Each built-in function evaluates to the expected per-row (or aggregated) result.'''
        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
        if is_aggregating:
            context.do_group_by((), [])
        result = FunctionCall.create(function_name, args, EMPTY_NODE).evaluate(context)
        assert isinstance(result, TypedSeries)
        self.assertEqual([result.type_.convert(elt) for elt in result.series],
                         expected_result)

    def test_current_timestamp(self):
        # type: () -> None
        '''CURRENT_TIMESTAMP() yields one recent, identical timestamp for every row.'''
        node, leftover = apply_rule(query_expression, tokenize(
            'select current_timestamp(), a from unnest([struct(1 as a), struct(2), struct(3)])'))
        assert isinstance(node, QueryExpression)
        self.assertFalse(leftover)
        result, _ = node.get_dataframe(DatasetTableContext({}))
        table = cast(List[List[datetime.datetime]], result.to_list_of_lists())
        self.assertEqual(len(table), 3)
        # CURRENT_TIMESTAMP() returns a very recent timestamp
        self.assertLess((datetime.datetime.now() - table[0][0]).seconds, 2)
        # All rows have the same timestamp value.
        self.assertEqual(table[0][0], table[1][0])
        self.assertEqual(table[0][0], table[2][0])

    @data(
        # These expressions are ones whose EvaluatableNode subclass constructs a
        # new pandas Series rather than computing on existing ones.  See below:
        # this runs the risk of constructing it with an incorrect index.
        dict(query='select 10, c',
             expected_result=[[10, 6], [10, 9]]),
        dict(query='select [a, b], c',
             expected_result=[[(4, 5), 6], [(7, 8), 9]]),
        dict(query='select (a, b), c',
             expected_result=[[(4, 5), 6], [(7, 8), 9]]),
        dict(query='select exists(select 1), c',
             expected_result=[[True, 6], [True, 9]]),
        dict(query='select a in (1, 4), c',
             expected_result=[[True, 6], [False, 9]]),
        dict(query='select row_number() over (), c',
             expected_result=[[1, 6], [2, 9]]),
        dict(query='select current_timestamp() > timestamp("2019-01-01"), c',
             expected_result=[[True, 6], [True, 9]]),
    )
    @unpack
    def test_constructed_column_has_correct_index(self, query, expected_result):
        # type: (str, List[List[int]]) -> None
        '''Checks that manually constructed columns have the same index as the data.

        A manually constructed column will usually have an index 0, 1, 2, ...
        (e.g. pd.Series(['a', 'b', 'c']) has index 0, 1, 2).  The data may not;
        filtering, sorting or other changes might result in an index of
        different numbers.  If one column's index doesn't match the index of
        other columns, it can't be compared or joined with them properly.
        '''
        table_context = DatasetTableContext(
            {'my_project': {'my_dataset': {'my_table': TypedDataFrame(
                pd.DataFrame([[1, 2, -1], [4, 5, 6], [7, 8, 9]], columns=['a', 'b', 'c']),
                types=[BQScalarType.INTEGER, BQScalarType.INTEGER, BQScalarType.INTEGER])}}})

        # Skip the first row of the table, so that the index of the table that
        # the test queries operate on is [1, 2]; this makes sure that the index is
        # different from the default index you would get for a two-row column,
        # which would be [0, 1], to test that expressions are not incorrectly
        # using that default index.
        node, leftover = select_rule(
            tokenize(query + ' from (select * from my_table where c > 0)'))
        assert isinstance(node, Select)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertFalse(leftover)
        self.assertEqual(result.to_list_of_lists(), expected_result)
        self.assertEqual(list(result.dataframe.index), [1, 2])

    def test_bad_function(self):
        # type: () -> None
        '''An unknown function name raises NotImplementedError at evaluation time.'''
        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
        with self.assertRaisesRegexp(NotImplementedError, 'NOT_A_FUNCTION not implemented'):
            FunctionCall.create('not_a_function', [], EMPTY_NODE).evaluate(context)

    @data(
        # Explore each aggregate function, along with a non-aggregate function to make sure we
        # can compute both at once.
        dict(selectors='sum(a), b+10', expected_result=[[6, 11], [5, 12]]),
        dict(selectors='sum(a), 20+10', expected_result=[[6, 30], [5, 30]]),
        dict(selectors='sum(a+1), b+10', expected_result=[[8, 11], [6, 12]]),
        dict(selectors='max(a), b+10', expected_result=[[4, 11], [5, 12]]),
        dict(selectors='min(a), b+10', expected_result=[[2, 11], [5, 12]]),
        dict(selectors='count(a), b+10', expected_result=[[2, 11], [1, 12]]),
        dict(selectors='count(*), b+10', expected_result=[[2, 11], [2, 12]]),
        dict(selectors='array_agg(a), []', expected_result=[[(2, 4), ()], [(5, None), ()]]),
        dict(selectors='array_agg(a), [b]', expected_result=[[(2, 4), (1, )], [(5, None), (2, )]]),
        dict(selectors='array_agg(a), [7, 8]',
             expected_result=[[(2, 4), (7, 8)], [(5, None), (7, 8)]]),
        dict(selectors='array_agg(a), b+10', expected_result=[[(2, 4), 11], [(5, None), 12]]),
    )
    @unpack
    def test_aggregate_functions_in_group_by(self, selectors, expected_result):
        # type: (str, List[List[int]]) -> None
        '''Aggregates (and mixed aggregate/scalar selectors) evaluate per GROUP BY group.'''
        # NaN in column 'a' exercises NULL handling inside the aggregates.
        table_context = DatasetTableContext(
            {'my_project': {'my_dataset': {'my_table': TypedDataFrame(
                pd.DataFrame([[2, 1], [4, 1], [5, 2], [np.nan, 2]], columns=['a', 'b']),
                types=[BQScalarType.INTEGER, BQScalarType.INTEGER])}}})

        tokens = tokenize('select {} from my_table group by b'.format(selectors))
        node, leftover = select_rule(tokens)
        assert isinstance(node, Select)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertFalse(leftover)
        self.assertEqual(result.to_list_of_lists(), expected_result)

    @data(
        dict(query='select sum(a + 1) + 2, count(*) + 3, 4 from my_table',
             expected_result=[[11, 6, 4]]),
    )
    @unpack
    def test_aggregate_functions_in_expressions(self, query, expected_result):
        # type: (str, List[List[int]]) -> None
        '''Aggregate calls may be nested inside larger arithmetic expressions.'''
        table_context = DatasetTableContext(
            {'my_project': {'my_dataset': {'my_table': TypedDataFrame(
                pd.DataFrame([[1], [2], [3]], columns=['a']),
                types=[BQScalarType.INTEGER])}}})

        node, leftover = select_rule(tokenize(query))
        assert isinstance(node, Select)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertFalse(leftover)
        self.assertEqual(result.to_list_of_lists(), expected_result)

    @data(
        # Test all variations of creating a struct (typed, typeless, tuple),
        # with and without named fields, with one field, and then with two
        # fields.
        dict(query='SELECT STRUCT<INTEGER>(1)',
             expected_result=(1, ),
             expected_type=BQStructType([None], [BQScalarType.INTEGER])),
        dict(query='SELECT STRUCT<a INTEGER>(1)',
             expected_result=(1, ),
             expected_type=BQStructType(['a'], [BQScalarType.INTEGER])),
        dict(query='SELECT STRUCT(1 AS a)',
             expected_result=(1, ),
             expected_type=BQStructType(['a'], [BQScalarType.INTEGER])),
        dict(query='SELECT STRUCT(1)',
             expected_result=(1, ),
             expected_type=BQStructType([None], [BQScalarType.INTEGER])),
        # Note: no test of single-element tuple syntax, as that would just be a
        # parenthesized expression, there's no analogue to Python's trailing comma.
        dict(query='SELECT STRUCT<INTEGER, STRING>(1, "a")',
             expected_result=(1, 'a'),
             expected_type=BQStructType([None, None],
                                        [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT STRUCT<a INTEGER, STRING>(1, "a")',
             expected_result=(1, 'a'),
             expected_type=BQStructType(['a', None],
                                        [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT STRUCT<INTEGER, b STRING>(1, "a")',
             expected_result=(1, 'a'),
             expected_type=BQStructType([None, 'b'],
                                        [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT STRUCT<a INTEGER, b STRING>(1, "a")',
             expected_result=(1, 'a'),
             expected_type=BQStructType(['a', 'b'],
                                        [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT STRUCT(1 AS a, "a" as b)',
             expected_result=(1, 'a'),
             expected_type=BQStructType(['a', 'b'],
                                        [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT STRUCT(1, "a" as b)',
             expected_result=(1, 'a'),
             expected_type=BQStructType([None, 'b'],
                                        [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT STRUCT(1 AS a, "a")',
             expected_result=(1, 'a'),
             expected_type=BQStructType(['a', None],
                                        [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT STRUCT(1, "a")',
             expected_result=(1, 'a'),
             expected_type=BQStructType([None, None],
                                        [BQScalarType.INTEGER, BQScalarType.STRING])),
        dict(query='SELECT (1, "a")',
             expected_result=(1, 'a'),
             expected_type=BQStructType([None, None],
                                        [BQScalarType.INTEGER, BQScalarType.STRING])),
    )
    @unpack
    def test_struct_constant_expressions(self, query, expected_result, expected_type):
        # type: (str, Tuple[Optional[int], ...], BQStructType) -> None
        '''Struct literals evaluate to the expected tuple value and struct type.'''
        table_context = DatasetTableContext({})
        node, leftover = select_rule(tokenize(query))
        self.assertFalse(leftover)
        assert isinstance(node, Select)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertEqual(result.to_list_of_lists(), [[expected_result]])
        self.assertEqual(result.types, [expected_type])

    @data(
        # Test all three struct syntaxes, selecting a column as one field, a
        # constant as the other.
        dict(query='SELECT (a, "a") FROM my_table',
             expected_result=[[(1, 'a')], [(2, 'a')]],
             expected_types=[
                 BQStructType([None, None], [BQScalarType.INTEGER, BQScalarType.STRING])]),
        dict(query='SELECT STRUCT(a as x, "a" as y) FROM my_table',
             expected_result=[[(1, 'a')], [(2, 'a')]],
             expected_types=[
                 BQStructType(['x', 'y'], [BQScalarType.INTEGER, BQScalarType.STRING])]),
        dict(query='SELECT STRUCT<x INTEGER, y STRING>(a, "a") FROM my_table',
             expected_result=[[(1, 'a')], [(2, 'a')]],
             expected_types=[
                 BQStructType(['x', 'y'], [BQScalarType.INTEGER, BQScalarType.STRING])]),
    )
    @unpack
    def test_struct_field_and_constant(self, query, expected_result, expected_types):
        # type: (str, List[List[Tuple[Optional[int], ...]]], Sequence[BQStructType]) -> None
        '''Structs may mix a table column with a constant for their fields.'''
        node, leftover = select_rule(tokenize(query))
        self.assertFalse(leftover)
        assert isinstance(node, Select)
        result, unused_table_name = node.get_dataframe(self.small_table_context)
        self.assertEqual(result.to_list_of_lists(), expected_result)
        self.assertEqual(result.types, expected_types)

    @data(
        # Test combination types of arrays and structs.
        dict(query='SELECT ([1], "a")',
             expected_result=((1, ), 'a'),
             expected_type=BQStructType([None, None],
                                        [BQArray(BQScalarType.INTEGER), BQScalarType.STRING])),
        dict(query='SELECT STRUCT<x ARRAY<INTEGER>, y STRING>(ARRAY<INTEGER>[1], "a")',
             expected_result=((1, ), 'a'),
             expected_type=BQStructType(['x', 'y'],
                                        [BQArray(BQScalarType.INTEGER), BQScalarType.STRING])),
        dict(query='SELECT [(1, "a")]',
             expected_result=((1, 'a'), ),
             expected_type=BQArray(BQStructType([None, None],
                                                [BQScalarType.INTEGER, BQScalarType.STRING]))),
        dict(query='SELECT [STRUCT<a INTEGER, b STRING>(1, "a"), (2, "b")]',
             expected_result=((1, 'a'), (2, 'b')),
             expected_type=BQArray(BQStructType(['a', 'b'],
                                                [BQScalarType.INTEGER, BQScalarType.STRING]))),
        # Test that an array of structs merges and coerces the types of the
        # structs.
        dict(query='SELECT [STRUCT<a FLOAT, STRING>(1.0, "a"), STRUCT<INTEGER, b STRING>(2, "b")]',
             expected_result=((1.0, 'a'), (2.0, 'b')),
             expected_type=BQArray(BQStructType(['a', 'b'],
                                                [BQScalarType.FLOAT, BQScalarType.STRING]))),
        dict(query='SELECT [STRUCT<a INTEGER, b ARRAY<STRING> >(1, ["a"]), (2, ["b", "c"])]',
             expected_result=((1, ('a', )), (2, ('b', 'c'))),
             expected_type=BQArray(BQStructType(['a', 'b'],
                                                [BQScalarType.INTEGER,
                                                 BQArray(BQScalarType.STRING)]))),
    )
    @unpack
    def test_complex_types(self, query, expected_result, expected_type):
        # type: (str, Tuple[Optional[int], ...], BQType) -> None
        '''Nested combinations of arrays and structs evaluate with merged types.'''
        table_context = DatasetTableContext({})
        node, leftover = select_rule(tokenize(query))
        self.assertFalse(leftover)
        assert isinstance(node, Select)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertEqual(result.to_list_of_lists(), [[expected_result]])
        self.assertEqual(result.types, [expected_type])

    @data(
        dict(query='SELECT ARRAY_AGG(a)',
             expected_result=(1, 1, 2, None)),
        dict(query='SELECT ARRAY_AGG(a RESPECT NULLS)',
             expected_result=(1, 1, 2, None)),
        dict(query='SELECT ARRAY_AGG(DISTINCT a)',
             expected_result=(1, 2, None)),
        dict(query='SELECT ARRAY_AGG(DISTINCT a RESPECT NULLS)',
             expected_result=(1, 2, None)),
        dict(query='SELECT ARRAY_AGG(a IGNORE NULLS)',
             expected_result=(1, 1, 2)),
        dict(query='SELECT ARRAY_AGG(DISTINCT a IGNORE NULLS)',
             expected_result=(1, 2)),
    )
    @unpack
    def test_array_agg_arguments(self, query, expected_result):
        # type: (str, Tuple[Optional[int], ...]) -> None
        '''ARRAY_AGG honors its DISTINCT and RESPECT/IGNORE NULLS modifiers.'''
        table_context = DatasetTableContext(
            {'p': {'d': {'t': TypedDataFrame(
                pd.DataFrame([[1], [1], [2], [None]], columns=['a']),
                types=[BQScalarType.INTEGER])}}})

        node, leftover = select_rule(tokenize(query + ' FROM p.d.t'))
        self.assertFalse(leftover)
        assert isinstance(node, Select)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertEqual(result.to_list_of_lists(), [[expected_result]])

    @data(
        dict(query='SELECT [1,2,"a"]',
             error='Cannot implicitly coerce the given types'),
        dict(query='SELECT STRUCT<INT64>(3.7)',
             error='Struct field 1 has type .*FLOAT which does not coerce to .*INTEGER'),
        dict(query='SELECT ARRAY<INT64>[3.7]',
             error='Array specifies type .*INTEGER, incompatible with values of type .*FLOAT'),
        dict(query='SELECT ARRAY<INT64>[1,2,"a"]',
             error='Cannot implicitly coerce the given types'),
        dict(query='SELECT ARRAY<string>[1,2]',
             error='Cannot implicitly coerce the given types'),
        dict(query='SELECT [[1]]',
             error='Cannot create arrays of arrays'),
        dict(query='SELECT [(1, 2), (3, 4, 5)]',
             error='Cannot merge .* number of fields varies'),
        dict(query='SELECT [STRUCT(1 as a, 2 as b), STRUCT(3 as x, 4 as b)]',
             error='Cannot merge Structs; field names .* do not match'),
        # same types in different orders can't merge.
        dict(query='SELECT [(1, "a"), ("b", 2)]',
             error='Cannot implicitly coerce the given types'),
        # same names in different orders can't merge
        dict(query='SELECT [STRUCT(1 as a, 2 as b), STRUCT(3 as b, 4 as a)]',
             error='Cannot merge Structs; field names .* do not match'),
    )
    @unpack
    def test_complex_type_errors(self, query, error):
        # type: (str, str) -> None
        '''Invalid array/struct combinations raise ValueError with a descriptive message.'''
        node, leftover = select_rule(tokenize(query))
        self.assertFalse(leftover)
        assert isinstance(node, Select)
        with self.assertRaisesRegexp(ValueError, error):
            node.get_dataframe(self.small_table_context)

    @data(
        # Row number over whole dataset; order is not guaranteed
        dict(selectors='row_number() over ()',
             expected_result=[[1], [2], [3], [4]]),
        dict(selectors='row_number() over (order by a), a',
             expected_result=[[1, 10], [2, 20], [3, 30], [4, 30]]),
        dict(selectors='row_number() over (order by a asc), a',
             expected_result=[[1, 10], [2, 20], [3, 30], [4, 30]]),
        dict(selectors='row_number() over (order by a desc), a',
             expected_result=[[4, 10], [3, 20], [2, 30], [1, 30]]),
        dict(selectors='row_number() over (partition by b order by a), a',
             expected_result=[[1, 10], [2, 20], [1, 30], [2, 30]]),
        dict(selectors='sum(a) over (), a',
             expected_result=[[90, 10], [90, 20], [90, 30], [90, 30]]),
        dict(selectors='sum(a) over (partition by b), a',
             expected_result=[[30, 10], [30, 20], [60, 30], [60, 30]]),
        dict(selectors='count(*) over (), a',
             expected_result=[[4, 10], [4, 20], [4, 30], [4, 30]]),
        dict(selectors='count(a) over (), a',
             expected_result=[[4, 10], [4, 20], [4, 30], [4, 30]]),
        dict(selectors='count(*) over (partition by b), a',
             expected_result=[[2, 10], [2, 20], [2, 30], [2, 30]]),
        dict(selectors='count(a) over (partition by b), a',
             expected_result=[[2, 10], [2, 20], [2, 30], [2, 30]]),
        dict(selectors='sum(count(*)) over ()',
             expected_result=[[4]]),
    )
    @unpack
    def test_analytic_function(self, selectors, expected_result):
        # type: (str, List[List[int]]) -> None
        '''Analytic (window) functions evaluate over the whole table or per partition.'''
        table_context = DatasetTableContext(
            {'my_project': {'my_dataset': {'my_table': TypedDataFrame(
                pd.DataFrame([[20, 200], [10, 200], [30, 300], [30, 300]],
                             columns=['a', 'b']),
                types=[BQScalarType.INTEGER, BQScalarType.INTEGER])}}})
        tokens = tokenize('select {} from my_table'.format(selectors))
        node, leftover = select_rule(tokens)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertFalse(leftover)
        # Note: BQ docs say if ORDER BY clause (for the select as a whole) is not present, order of
        # results is undefined, so we do not assert on the order.
        six.assertCountEqual(self, result.to_list_of_lists(), expected_result)

    @data(
        dict(selectors='sum(count(*)) over (), count(*)',
             expected_result=[[5, 2], [5, 3]]),
    )
    @unpack
    def test_analytic_function_with_group_by(self, selectors, expected_result):
        # type: (str, List[List[int]]) -> None
        '''Analytic functions may wrap aggregates computed by a GROUP BY.'''
        table_context = DatasetTableContext(
            {'my_project': {'my_dataset': {'my_table': TypedDataFrame(
                pd.DataFrame([[20, 2], [10, 2], [30, 3], [31, 3], [32, 3]],
                             columns=['a', 'b']),
                types=[BQScalarType.INTEGER, BQScalarType.INTEGER])}}})
        tokens = tokenize('select {} from my_table group by b'.format(selectors))
        node, leftover = select_rule(tokens)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertFalse(leftover)
        # Note: BQ docs say if ORDER BY clause (for the select as a whole) is not present, order of
        # results is undefined, so we do not assert on the order.
        six.assertCountEqual(self, result.to_list_of_lists(), expected_result)

    def test_non_aggregate_function_in_group_by(self):
        # type: () -> None
        '''A scalar function of a grouped-by column may appear inside an aggregate.'''
        table_context = DatasetTableContext(
            {'my_project': {'my_dataset': {'my_table': TypedDataFrame(
                pd.DataFrame([['one', '1'], ['two', '1'], ['three', '2'], ['four', '2']],
                             columns=['a', 'b']),
                types=[BQScalarType.STRING, BQScalarType.INTEGER])}}})

        tokens = tokenize('select max(concat(b, "hi")) from my_table group by b')
        node, leftover = select_rule(tokens)
        self.assertFalse(leftover)
        result, unused_table_name = node.get_dataframe(table_context)
        self.assertEqual(result.to_list_of_lists(), [['1hi'], ['2hi']])

    @data(
        dict(count='COUNT(*)', expected_result=[[2]]),
        dict(count='COUNT(c)', expected_result=[[2]]),
        dict(count='COUNT(DISTINCT c)', expected_result=[[1]]),
        dict(count='COUNT(b)', expected_result=[[2]]),
        dict(count='COUNT(DISTINCT b)', expected_result=[[2]]),
        dict(count='COUNT(a)', expected_result=[[1]]),
    )
    @unpack
    def test_count(self, count, expected_result):
        # type: (str, List[List[int]]) -> None
        '''COUNT skips NULLs, and DISTINCT collapses duplicates.'''
        count_table_context = DatasetTableContext(
            {'my_project': {'my_dataset': {'my_table': TypedDataFrame(
                pd.DataFrame([[1, 2, 3], [None, 4, 3]], columns=['a', 'b', 'c']),
                types=[BQScalarType.INTEGER, BQScalarType.INTEGER, BQScalarType.INTEGER])}}})
        select, leftover = select_rule(tokenize('SELECT {} FROM my_table'.format(count)))
        self.assertFalse(leftover)
        assert isinstance(select, Select)
        dataframe, unused_table_name = select.get_dataframe(count_table_context)
        self.assertEqual(dataframe.to_list_of_lists(), expected_result)

    @data(('IS_NULL', [True, False]),
          ('IS_NOT_NULL', [False, True]))
    @unpack
    def test_null_check(self, direction, result):
        # type: (str, List[bool]) -> None
        '''IS [NOT] NULL evaluates per row against a column containing a NULL.'''
        table_context = DatasetTableContext(
            {'my_project': {'my_dataset': {'my_table': TypedDataFrame(
                pd.DataFrame([[1, None], [2, 3]], columns=['a', 'b']),
                types=[BQScalarType.INTEGER, BQScalarType.INTEGER])}}})

        context = EvaluationContext(table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
        expression = Field(('b', ))
        null_check = NullCheck(expression, direction)

        typed_series = null_check.evaluate(context)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), result)

    @data(('IN', [True, False]),
          ('NOT_IN', [False, True]))
    @unpack
    def test_in_check(self, direction, result):
        # type: (str, List[bool]) -> None
        '''[NOT] IN evaluates membership of each row's value in a literal list.'''
        expression = Field(('a', ))
        elements = (Value(1, type_=BQScalarType.INTEGER), Value(3, type_=BQScalarType.INTEGER))
        in_check = InCheck(expression, direction, elements)

        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
        typed_series = in_check.evaluate(context)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), result)

    @data((True, 0),
          (False, 1))
    @unpack
    def test_if_empty_context(self, condition_bool, result):
        # type: (bool, int) -> None
        '''IF with a constant condition selects the THEN or ELSE constant.'''
        condition = Value(condition_bool, BQScalarType.BOOLEAN)
        then = Value(0, BQScalarType.INTEGER)
        else_ = Value(1, BQScalarType.INTEGER)
        # IF [condition] THEN 0 ELSE 1
        if_expression = If(condition, then, else_)

        typed_series = if_expression.evaluate(EMPTY_CONTEXT)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), [result])

    def test_if(self):
        # type: () -> None
        '''IF with a per-row condition selects THEN or ELSE row by row.'''
        condition = BinaryExpression(Field(('a', )), '>', Value(1, BQScalarType.INTEGER))
        then = Value('yes', BQScalarType.STRING)
        else_ = Value('no', BQScalarType.STRING)
        # IF a > 1 THEN "yes" ELSE "no"
        if_expression = If(condition, then, else_)

        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
        typed_series = if_expression.evaluate(context)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), ['no', 'yes'])

    def test_if_different_types(self):
        # type: () -> None
        '''IF branches of incompatible types raise ValueError.'''
        condition = Value(True, BQScalarType.BOOLEAN)
        then = Value('yes', BQScalarType.STRING)
        else_ = Value(1, BQScalarType.INTEGER)
        if_expression = If(condition, then, else_)

        error = (r"Cannot implicitly coerce the given types: "
                 r"\(BQScalarType.STRING, BQScalarType.INTEGER\)")
        with self.assertRaisesRegexp(ValueError, error):
            if_expression.evaluate(EMPTY_CONTEXT)

    def test_if_error(self):
        # type: () -> None
        '''A non-boolean IF condition raises ValueError.'''
        condition = Value(5, BQScalarType.INTEGER)
        then = Value(0, BQScalarType.INTEGER)
        else_ = Value(1, BQScalarType.INTEGER)
        if_expression = If(condition, then, else_)

        error = escape("IF condition isn't boolean! Found: {}".format(
            str(condition.evaluate(EMPTY_CONTEXT))))
        with self.assertRaisesRegexp(ValueError, error):
            if_expression.evaluate(EMPTY_CONTEXT)

    def test_not(self):
        # type: () -> None
        '''NOT inverts a boolean constant.'''
        expression = Value(True, BQScalarType.BOOLEAN)
        not_expression = Not(expression)

        typed_series = not_expression.evaluate(EMPTY_CONTEXT)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), [False])

    def test_not_type_error(self):
        # type: () -> None
        '''NOT of a non-boolean raises ValueError.'''
        expression = Value(5, BQScalarType.INTEGER)
        not_expression = Not(expression)

        with self.assertRaisesRegexp(ValueError, ""):
            not_expression.evaluate(EMPTY_CONTEXT)

    @data(
        (1, BQScalarType.INTEGER, -1),
        (1.0, BQScalarType.FLOAT, -1.0),
    )
    @unpack
    def test_unary_negation(self, initial_value, value_type, result_value):
        # type: (Any, BQScalarType, Any) -> None
        '''Unary minus negates integers and floats.'''
        expression = Value(initial_value, value_type)
        negation = UnaryNegation(expression)

        typed_series = negation.evaluate(EMPTY_CONTEXT)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), [result_value])

    @data(
        ("abc", BQScalarType.STRING),
        (True, BQScalarType.BOOLEAN),
    )
    @unpack
    def test_unary_negation_error(self, value, value_type):
        # type: (Any, BQScalarType) -> None
        '''Unary minus on a non-numeric type raises TypeError.'''
        expression = Value(value, value_type)
        negation = UnaryNegation(expression)

        error = ("UnaryNegation expression supports only integers and floats, got: {}"
                 .format(value_type))
        with self.assertRaisesRegexp(TypeError, error):
            negation.evaluate(EMPTY_CONTEXT)

    @data(
        dict(comparand=Field(('a', )),
             whens=[(Value(1, BQScalarType.INTEGER), Value("one", BQScalarType.STRING)),
                    (Value(2, BQScalarType.INTEGER), Value("two", BQScalarType.STRING))],
             else_=Value("other", BQScalarType.STRING),
             result=["one", "two"]),
        dict(comparand=Field(('a', )),
             whens=[(Value(1, BQScalarType.INTEGER), Value("one", BQScalarType.STRING))],
             else_=Value("other", BQScalarType.STRING),
             result=["one", "other"]),
        dict(comparand=EMPTY_NODE,
             whens=[(Value(True, BQScalarType.BOOLEAN), Value("yes", BQScalarType.STRING)),
                    (Value(False, BQScalarType.BOOLEAN), Value("no", BQScalarType.STRING))],
             else_=EMPTY_NODE,
             result=["yes", "yes"]),
        dict(comparand=Field(('a', )),
             whens=[(Value(1, BQScalarType.INTEGER), Value("one", BQScalarType.STRING))],
             else_=EMPTY_NODE,
             result=["one", None]),
    )
    @unpack
    def test_case_with_comparand(
            self,
            comparand,  # type: Union[_EmptyNode, EvaluatableNode]
            whens,  # type: List[Tuple[AbstractSyntaxTreeNode, EvaluatableNode]]
            else_,  # type: EvaluatableNode
            result  # type: List[str]
    ):
        # type: (...) -> None
        '''CASE evaluates per row, with and without a comparand and ELSE clause.'''
        case = Case(comparand, whens, else_)

        context = EvaluationContext(self.small_table_context)
        context.add_table_from_node(
            TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
        typed_series = case.evaluate(context)
        assert isinstance(typed_series, TypedSeries)
        self.assertEqual(list(typed_series.series), result)
def test_invalid_value(self, value, type_): # type: (Optional[LiteralType], Optional[BQScalarType]) -> None '''Check that None is only allowed as both value and type_ or neither.''' with self.assertRaises(ValueError): Value(value, type_)