def test_data_source_join_on_field_comparison(self, condition, expected_result):
    # type: (str, List[List[int]]) -> None
    """Join two fully-qualified tables with an ON clause built from `condition`.

    Args:
        condition: Text substituted after ``on`` in the data source string.
        expected_result: Rows the joined table is expected to contain.
    """
    left_table = TypedDataFrame(
        pd.DataFrame([[1, 9], [2, 8], [2, 1]], columns=['a', 'b']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    right_table = TypedDataFrame(
        pd.DataFrame([[1, 2], [3, 4]], columns=['c', 'd']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    table_context = DatasetTableContext({
        'my_project': {
            'my_dataset': {
                'my_table': left_table,
                'my_table2': right_table,
            }
        }
    })

    source_text = (
        'my_project.my_dataset.my_table join my_project.my_dataset.my_table2 on {}'
        .format(condition))
    data_source_node, leftover = data_source(tokenize(source_text))
    # Every token must have been consumed by the data source rule.
    self.assertFalse(leftover)
    assert isinstance(data_source_node, DataSource)

    context = data_source_node.create_context(table_context)
    joined_rows = context.table.to_list_of_lists()
    joined_columns = list(context.table.dataframe)
    self.assertEqual(joined_rows, expected_result)
    # Output columns are qualified by the table they came from.
    self.assertEqual(
        joined_columns,
        ['my_table.a', 'my_table.b', 'my_table2.c', 'my_table2.d'])
def test_complex_type_errors(self, query, error):
    # type: (str, str) -> None
    """Check that evaluating `query` raises a ValueError matching `error`.

    Args:
        query: SELECT statement text to parse and evaluate.
        error: Regular expression the ValueError message must match.
    """
    node, leftover = select_rule(tokenize(query))
    self.assertFalse(leftover)
    assert isinstance(node, Select)

    # assertRaisesRegexp was removed in Python 3.12; prefer the modern name and
    # fall back to the legacy alias only where it is the sole spelling (2.7).
    raises_regex = getattr(self, 'assertRaisesRegex', None) or self.assertRaisesRegexp
    with raises_regex(ValueError, error):
        node.get_dataframe(self.small_table_context)
def test_data_source_join_on_arbitrary_bool(
        self,
        join_type,  # type: Union[_EmptyNode, str]
        result  # type: List[List[int]]
):
    # type: (...) -> None
    """Join two one-column tables on the expression ``MOD(a + b, 3) = 0``.

    Args:
        join_type: Join keyword(s) formatted between the two table names.
        result: Rows the joined table is expected to contain.
    """
    left_table = TypedDataFrame(
        pd.DataFrame([[1], [2]], columns=['a']),
        types=[BQScalarType.INTEGER])
    right_table = TypedDataFrame(
        pd.DataFrame([[2], [0]], columns=['b']),
        types=[BQScalarType.INTEGER])
    table_context = DatasetTableContext({
        'my_project': {
            'my_dataset': {
                'my_table': left_table,
                'my_table2': right_table,
            }
        }
    })

    source_text = 'my_table {} my_table2 ON MOD(a + b, 3) = 0'.format(join_type)
    data_source_node, leftover = apply_rule(data_source, tokenize(source_text))
    self.assertFalse(leftover)
    assert isinstance(data_source_node, DataSource)

    joined = data_source_node.create_context(table_context)
    self.assertEqual(joined.table.to_list_of_lists(), result)
def test_exists_reference_outer(self):
    """Evaluate EXISTS over a subquery whose WHERE clause names an outer table.

    The subquery selects from my_table but compares against my_table2.b;
    my_table2 is only added to the evaluation context, not to the subquery's
    FROM clause.
    """
    inner_table = TypedDataFrame(
        pd.DataFrame([[1], [4]], columns=['a']),
        types=[BQScalarType.INTEGER])
    outer_table = TypedDataFrame(
        pd.DataFrame([[4], [2]], columns=['b']),
        types=[BQScalarType.INTEGER])
    table_context = DatasetTableContext({
        'my_project': {
            'my_dataset': {
                'my_table': inner_table,
                'my_table2': outer_table,
            }
        }
    })

    select_query = (
        "select a from `my_project.my_dataset.my_table` where "
        "my_table.a = my_table2.b")
    select_node, leftover = apply_rule(select_rule, tokenize(select_query))
    self.assertFalse(leftover)

    exists = Exists(select_node)
    context = EvaluationContext(table_context)
    outer_reference = TableReference(('my_project', 'my_dataset', 'my_table2'))
    context.add_table_from_node(outer_reference, EMPTY_NODE)

    evaluated = exists.evaluate(context)
    # One boolean per my_table2 row: b=4 has a matching a, b=2 does not.
    self.assertEqual(list(evaluated.series), [True, False])
def test_match_order(self):
    """Test tokenization of operators that share characters."""
    sql = "SELECT f1 <> 3, f2 << 2, f3 < 5 AND f3 > -1 FROM t"
    # '<>' and '<<' must come out as single tokens rather than '<' plus
    # another character; '-1' is split into '-' and '1'.
    expected_tokens = [
        'SELECT',
        'f1', '<>', '3', ',',
        'f2', '<<', '2', ',',
        'f3', '<', '5',
        'AND',
        'f3', '>', '-', '1',
        'FROM', 't',
    ]
    self.assertEqual(tokenize(sql), expected_tokens)
def test_negatives(self):
    """Test negative and positive numbers."""
    sql = "SELECT -1.23e1, -1e1, -.23e2, 4.0e-7, +5, +3. FROM t"
    # A leading sign is its own token; a sign inside an exponent stays
    # attached to the number ('4.0e-7').
    expected_tokens = [
        'SELECT',
        '-', '1.23e1', ',',
        '-', '1e1', ',',
        '-', '.23e2', ',',
        '4.0e-7', ',',
        '+', '5', ',',
        '+', '3.',
        'FROM', 't',
    ]
    self.assertEqual(tokenize(sql), expected_tokens)
def test_bigquery_statement(self, statement, type_):
    # type: (str, type) -> None
    """Parse `statement` and check the type of the resulting node.

    Args:
        statement: Statement text to tokenize and parse.
        type_: Class the parsed statement node must be an instance of.
    """
    parsed, leftover = apply_rule(bigquery_statement, tokenize(statement))
    self.assertFalse(leftover)
    # The rule yields a pair: the statement node and a second element that
    # this test ignores.
    assert isinstance(parsed, tuple)
    statement_node, unused_semicolon = parsed
    self.assertIsInstance(statement_node, type_)
def test_data_source_joins(
        self,
        join_type,  # type: Union[_EmptyNode, str]
        table1,  # type: List[List[int]]
        table2,  # type: List[List[int]]
        result  # type: List[List[int]]
):
    # type: (...) -> None
    """Join two tables sharing column ``a`` using the given join type.

    Args:
        join_type: Join keyword(s) (or comma) placed between the table names.
        table1: Rows for my_table, with columns a and b.
        table2: Rows for my_table2, with columns a and c.
        result: Rows the joined table is expected to contain.
    """
    left_table = TypedDataFrame(
        pd.DataFrame(table1, columns=['a', 'b']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    right_table = TypedDataFrame(
        pd.DataFrame(table2, columns=['a', 'c']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    table_context = DatasetTableContext({
        'my_project': {
            'my_dataset': {
                'my_table': left_table,
                'my_table2': right_table,
            }
        }
    })

    # Comma and CROSS JOIN get no join condition; all other join types join
    # on the shared column.
    if join_type in (',', 'CROSS JOIN'):
        join_condition = ''
    else:
        join_condition = 'USING (a)'
    source_text = 'my_table {} my_table2 {}'.format(join_type, join_condition)

    data_source_node, leftover = apply_rule(data_source, tokenize(source_text))
    self.assertFalse(leftover)
    assert isinstance(data_source_node, DataSource)

    context = data_source_node.create_context(table_context)
    self.assertEqual(context.table.to_list_of_lists(), result)
    joined_columns = list(context.table.dataframe)
    self.assertEqual(
        joined_columns,
        ['my_table.a', 'my_table.b', 'my_table2.a', 'my_table2.c'])
def test_select_star(self, select, expected_result):
    # type: (str, List[List[int]]) -> None
    """Evaluate a SELECT over two three-column tables and compare rows.

    Args:
        select: SELECT statement text to parse and evaluate.
        expected_result: Rows the resulting dataframe should contain.
    """
    group_table_context = DatasetTableContext({
        'p': {
            'd': {
                'table1': TypedDataFrame(
                    pd.DataFrame([[2, 8, 4], [6, 3, 0], [12, 10, 1]],
                                 columns=['a', 'b', 'c']),
                    types=[
                        BQScalarType.INTEGER, BQScalarType.INTEGER,
                        BQScalarType.INTEGER
                    ]),
                'table2': TypedDataFrame(
                    pd.DataFrame([[2, 7, 3], [6, 2, -1], [12, 9, 0]],
                                 columns=['a', 'd', 'e']),
                    types=[
                        BQScalarType.INTEGER, BQScalarType.INTEGER,
                        BQScalarType.INTEGER
                    ]),
            }
        }
    })
    select_node, leftover = select_rule(tokenize(select))
    # Check for unparsed tokens before evaluating, so an incomplete parse is
    # reported as such rather than as a downstream evaluation failure.
    self.assertFalse(leftover)
    assert isinstance(select_node, Select)
    dataframe, unused_table_name = select_node.get_dataframe(
        group_table_context)
    self.assertEqual(dataframe.to_list_of_lists(), expected_result)
def test_create_table_with_select_mismatched_types(self, query, error):
    # type: (str, str) -> None
    """Check that executing `query` raises a ValueError matching `error`.

    Args:
        query: Statement text to parse and execute against an empty dataset.
        error: Regular expression the ValueError message must match.
    """
    node, leftover = apply_rule(statement_rule, tokenize(query))
    self.assertFalse(leftover)
    table_context = DatasetTableContext({'project': {'dataset': {}}})
    assert isinstance(node, Statement)

    # assertRaisesRegexp was removed in Python 3.12; prefer the modern name and
    # fall back to the legacy alias only where it is the sole spelling (2.7).
    raises_regex = getattr(self, 'assertRaisesRegex', None) or self.assertRaisesRegexp
    with raises_regex(ValueError, error):
        node.execute(table_context)
def test_with_clause_error(self, query_expression, error):
    # type: (str, str) -> None
    """Check that evaluating `query_expression` raises a matching ValueError.

    Args:
        query_expression: Query expression text to parse and evaluate.
        error: Regular expression the ValueError message must match.
    """
    query_expression_node, leftover = query_expression_rule(
        tokenize(query_expression))
    self.assertFalse(leftover)
    assert isinstance(query_expression_node, QueryExpression)

    # assertRaisesRegexp was removed in Python 3.12; prefer the modern name and
    # fall back to the legacy alias only where it is the sole spelling (2.7).
    raises_regex = getattr(self, 'assertRaisesRegex', None) or self.assertRaisesRegexp
    with raises_regex(ValueError, error):
        query_expression_node.get_dataframe(self.table_context)
def test_struct_field_and_constant(self, query, expected_result, expected_types):
    # type: (str, List[List[Tuple[Optional[int], ...]]], Sequence[BQStructType]) -> None
    """Evaluate `query` on the small table context; check rows and types.

    Args:
        query: SELECT statement text to parse and evaluate.
        expected_result: Rows the resulting dataframe should contain.
        expected_types: Column types the resulting dataframe should report.
    """
    select_node, leftover = select_rule(tokenize(query))
    self.assertFalse(leftover)
    assert isinstance(select_node, Select)

    typed_frame, unused_table_name = select_node.get_dataframe(
        self.small_table_context)
    self.assertEqual(typed_frame.to_list_of_lists(), expected_result)
    self.assertEqual(typed_frame.types, expected_types)
def test_with_clause(self, query_expression, expected_result):
    # type: (str, List[List[int]]) -> None
    """Evaluate `query_expression` and compare the resulting rows.

    Args:
        query_expression: Query expression text to parse and evaluate.
        expected_result: Rows the resulting dataframe should contain.
    """
    tokens = tokenize(query_expression)
    query_expression_node, leftover = query_expression_rule(tokens)
    self.assertFalse(leftover)
    assert isinstance(query_expression_node, QueryExpression)

    typed_frame, _ = query_expression_node.get_dataframe(self.table_context)
    actual_rows = typed_frame.to_list_of_lists()
    self.assertEqual(actual_rows, expected_result)
def test_unnest_error(self, query, expected_error):
    # type: (str, str) -> None
    """Check that evaluating `query` raises a ValueError matching `expected_error`.

    Args:
        query: Query expression text to parse and evaluate.
        expected_error: Regular expression the ValueError message must match.
    """
    node, leftover = query_expression_rule(tokenize(query))
    self.assertFalse(leftover)
    assert isinstance(node, QueryExpression)

    # assertRaisesRegexp was removed in Python 3.12; prefer the modern name and
    # fall back to the legacy alias only where it is the sole spelling (2.7).
    raises_regex = getattr(self, 'assertRaisesRegex', None) or self.assertRaisesRegexp
    with raises_regex(ValueError, expected_error):
        node.get_dataframe(TableContext())
def test_like_order_by(self):
    """Test tokenization of operators like WHERE, LIKE, ORDER BY."""
    sql = "SELECT * FROM mytable WHERE field1 LIKE 'a%' ORDER BY field2"
    # The quoted pattern stays a single token, quotes included; ORDER and BY
    # are separate tokens.
    expected_tokens = [
        'SELECT', '*',
        'FROM', 'mytable',
        'WHERE', 'field1', 'LIKE', "'a%'",
        'ORDER', 'BY', 'field2',
    ]
    self.assertEqual(tokenize(sql), expected_tokens)
def test_complex_types(self, query, expected_result, expected_type):
    # type: (str, Tuple[Optional[int], ...], BQType) -> None
    """Evaluate a table-free `query` that yields a single value.

    Args:
        query: SELECT statement text to parse and evaluate.
        expected_result: The single cell value the result should contain.
        expected_type: The type of the single result column.
    """
    empty_context = DatasetTableContext({})
    select_node, leftover = select_rule(tokenize(query))
    self.assertFalse(leftover)
    assert isinstance(select_node, Select)

    typed_frame, unused_table_name = select_node.get_dataframe(empty_context)
    # Exactly one row with one column is expected.
    self.assertEqual(typed_frame.to_list_of_lists(), [[expected_result]])
    self.assertEqual(typed_frame.types, [expected_type])
def test_create_table_with_select(self, statement, columns):
    # type: (str, List[str]) -> None
    """Execute `statement` and check the created table's path and schema.

    Args:
        statement: Statement text to parse and execute.
        columns: Column names the created table should have.
    """
    statement_node, leftover = apply_rule(statement_rule, tokenize(statement))
    self.assertFalse(leftover)
    table_context = DatasetTableContext({'project': {'dataset': {}}})
    assert isinstance(statement_node, Statement)

    result = statement_node.execute(table_context)
    self.assertEqual(result.path, ('project', 'dataset', 'table'))

    # Look the table up in the context to inspect what was stored there.
    created_table, unused_name = table_context.lookup(result.path)
    actual_columns = list(created_table.dataframe.columns)
    self.assertEqual(actual_columns, columns)
    self.assertEqual(created_table.types,
                     [BQScalarType.INTEGER, BQScalarType.STRING])
def test_unnest(self, query, result, result_columns):
    # type: (str, List[List[Any]], List[str]) -> None
    """Evaluate `query` with an empty TableContext; check rows and columns.

    Args:
        query: Query expression text to parse and evaluate.
        result: Rows the resulting dataframe should contain.
        result_columns: Column names the resulting dataframe should have.
    """
    query_node, leftover = query_expression_rule(tokenize(query))
    self.assertFalse(leftover)
    assert isinstance(query_node, QueryExpression)

    typed_frame, _ = query_node.get_dataframe(TableContext())
    self.assertEqual(typed_frame.to_list_of_lists(), result)
    actual_columns = list(typed_frame.dataframe.columns)
    self.assertEqual(actual_columns, result_columns)
def test_scalar_expressions(self, expression, expected_result):
    # type: (str, Any) -> None
    """Evaluate `expression` in the empty context and check its single value.

    Args:
        expression: Expression text to parse and evaluate.
        expected_result: The one value the evaluated series should contain.
    """
    ast, leftover = apply_rule(expression_rule, tokenize(expression))
    # Include the unparsed tokens in the failure message for easier debugging.
    leftover_message = 'leftover {}'.format(leftover)
    self.assertFalse(leftover, leftover_message)
    assert isinstance(ast, EvaluatableNode)

    typed_series = ast.evaluate(EMPTY_CONTEXT)
    assert isinstance(typed_series, TypedSeries)
    self.assertEqual(typed_series.to_list(), [expected_result])
def test_array_agg_arguments(self, query, expected_result):
    # type: (str, Tuple[Optional[int], ...]) -> None
    """Evaluate `query` over a one-column table holding 1, 1, 2 and None.

    Args:
        query: SELECT text without a FROM clause; ' FROM p.d.t' is appended.
        expected_result: The single cell value the result should contain.
    """
    source_table = TypedDataFrame(
        pd.DataFrame([[1], [1], [2], [None]], columns=['a']),
        types=[BQScalarType.INTEGER])
    table_context = DatasetTableContext({'p': {'d': {'t': source_table}}})

    full_query = query + ' FROM p.d.t'
    select_node, leftover = select_rule(tokenize(full_query))
    self.assertFalse(leftover)
    assert isinstance(select_node, Select)

    typed_frame, unused_table_name = select_node.get_dataframe(table_context)
    self.assertEqual(typed_frame.to_list_of_lists(), [[expected_result]])
def test_non_aggregate_function_in_group_by(self):
    """Apply an aggregate to a non-aggregate function call under GROUP BY.

    Selects ``max(concat(b, "hi"))`` grouped by b and expects one concatenated
    value per group.
    """
    # NOTE(review): column b holds string values although it is declared
    # INTEGER -- presumably so concat receives strings; confirm.
    source_table = TypedDataFrame(
        pd.DataFrame(
            [['one', '1'], ['two', '1'], ['three', '2'], ['four', '2']],
            columns=['a', 'b']),
        types=[BQScalarType.STRING, BQScalarType.INTEGER])
    table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': source_table}}})

    query = 'select max(concat(b, "hi")) from my_table group by b'
    select_node, leftover = select_rule(tokenize(query))
    self.assertFalse(leftover)

    typed_frame, unused_table_name = select_node.get_dataframe(table_context)
    self.assertEqual(typed_frame.to_list_of_lists(), [['1hi'], ['2hi']])
def test_analytic_function_with_group_by(self, selectors, expected_result):
    """Evaluate ``select <selectors> from my_table group by b``.

    Args:
        selectors: Text placed between ``select`` and ``from`` in the query.
        expected_result: Rows expected in the result, in any order.
    """
    table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': TypedDataFrame(
            pd.DataFrame([[20, 2], [10, 2], [30, 3], [31, 3], [32, 3]],
                         columns=['a', 'b']),
            types=[BQScalarType.INTEGER, BQScalarType.INTEGER])}}})
    tokens = tokenize('select {} from my_table group by b'.format(selectors))
    node, leftover = select_rule(tokens)
    # Check for unparsed tokens before evaluating, so an incomplete parse is
    # reported as such rather than as a downstream evaluation failure.
    self.assertFalse(leftover)
    result, unused_table_name = node.get_dataframe(table_context)
    # Note: BQ docs say if ORDER BY clause (for the select as a whole) is not present, order of
    # results is undefined, so we do not assert on the order.
    six.assertCountEqual(self, result.to_list_of_lists(), expected_result)
def test_join(self):
    """Test tokenization of syntax related to JOIN."""
    # Adjacent literals concatenate to the same query text as the original
    # single literal, including its trailing space.
    sql = (
        'SELECT * FROM (SELECT field1, field2 FROM table1) AS t1 '
        'JOIN (SELECT field3, field4 FROM table2) AS t2 '
        'ON t1.field1=t2.field2 '
    )
    # Parentheses, commas, dots and '=' each come out as their own token.
    expected_tokens = [
        'SELECT', '*', 'FROM',
        '(', 'SELECT', 'field1', ',', 'field2', 'FROM', 'table1', ')',
        'AS', 't1',
        'JOIN',
        '(', 'SELECT', 'field3', ',', 'field4', 'FROM', 'table2', ')',
        'AS', 't2',
        'ON', 't1', '.', 'field1', '=', 't2', '.', 'field2',
    ]
    self.assertEqual(tokenize(sql), expected_tokens)
def test_aggregate_functions_in_group_by(self, selectors, expected_result):
    # type: (str, List[List[int]]) -> None
    """Evaluate ``select <selectors> from my_table group by b``.

    The table's column a contains a NaN in one of the two groups.

    Args:
        selectors: Text placed between ``select`` and ``from`` in the query.
        expected_result: Rows the resulting dataframe should contain.
    """
    table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': TypedDataFrame(
            pd.DataFrame([[2, 1], [4, 1], [5, 2], [np.nan, 2]],
                         columns=['a', 'b']),
            types=[BQScalarType.INTEGER, BQScalarType.INTEGER])}}})
    tokens = tokenize('select {} from my_table group by b'.format(selectors))
    node, leftover = select_rule(tokens)
    # Check for unparsed tokens before evaluating, so an incomplete parse is
    # reported as such rather than as a downstream evaluation failure.
    self.assertFalse(leftover)
    assert isinstance(node, Select)
    result, unused_table_name = node.get_dataframe(table_context)
    self.assertEqual(result.to_list_of_lists(), expected_result)
def test_exists(self, select_query, result):
    # type: (str, List[bool]) -> None
    """Evaluate EXISTS over `select_query` with my_table in the context.

    Args:
        select_query: Subquery text to parse and wrap in an Exists node.
        result: Booleans the evaluated series is expected to contain.
    """
    tokens = tokenize(select_query)
    subquery_node, leftover = apply_rule(query_expression, tokens)
    assert isinstance(subquery_node, QueryExpression)
    self.assertFalse(leftover)

    exists = Exists(subquery_node)
    context = EvaluationContext(self.small_table_context)
    outer_reference = TableReference(('my_project', 'my_dataset', 'my_table'))
    context.add_table_from_node(outer_reference, EMPTY_NODE)

    typed_series = exists.evaluate(context)
    assert isinstance(typed_series, TypedSeries)
    actual = list(typed_series.series)
    self.assertEqual(actual, result)
def test_create_table_already_exists(self):
    # type: () -> None
    """CREATE TABLE on a path that already holds a table raises ValueError.

    A table with a different schema is stored at the target path first; a
    plain CREATE TABLE is then expected to fail with 'Already Exists'.
    """
    node, leftover = apply_rule(statement_rule, tokenize(
        'CREATE TABLE project.dataset.table (a int64, b string);'))
    self.assertFalse(leftover)
    table_context = DatasetTableContext({'project': {'dataset': {}}})
    original_table = TypedDataFrame(
        pd.DataFrame([], columns=['x', 'y', 'z']),
        [BQScalarType.STRING, BQScalarType.INTEGER, BQScalarType.BOOLEAN])
    table_context.set(('project', 'dataset', 'table'), original_table)
    assert isinstance(node, Statement)

    # assertRaisesRegexp was removed in Python 3.12; prefer the modern name and
    # fall back to the legacy alias only where it is the sole spelling (2.7).
    raises_regex = getattr(self, 'assertRaisesRegex', None) or self.assertRaisesRegexp
    with raises_regex(ValueError, 'Already Exists'):
        node.execute(table_context)
def test_create_table_if_not_exists_and_it_does(self):
    # type: () -> None
    """CREATE TABLE IF NOT EXISTS leaves an already-present table in place.

    A table with a different schema is stored at the target path first; after
    executing the statement, the very same table object must still be there.
    """
    statement_text = (
        'CREATE TABLE IF NOT EXISTS project.dataset.table (a int64, b string);')
    statement_node, leftover = apply_rule(statement_rule, tokenize(statement_text))
    self.assertFalse(leftover)

    table_path = ('project', 'dataset', 'table')
    table_context = DatasetTableContext({'project': {'dataset': {}}})
    original_table = TypedDataFrame(
        pd.DataFrame([], columns=['x', 'y', 'z']),
        [BQScalarType.STRING, BQScalarType.INTEGER, BQScalarType.BOOLEAN])
    table_context.set(table_path, original_table)
    assert isinstance(statement_node, Statement)

    result = statement_node.execute(table_context)
    self.assertEqual(result.path, ('project', 'dataset', 'table'))
    stored_table, unused_name = table_context.lookup(result.path)
    # Identity, not equality: the existing table must not have been replaced.
    self.assertIs(stored_table, original_table)
def test_general_tokenization(self):
    """Test tokenization of numbers, strings, variables, comments, lists."""
    # The line breaks are spelled out explicitly: each `--` comment has to end
    # at a newline, because the expected tokens below omit the comment text
    # ('first', 'more stuff') but include the tokens that follow it.
    sql = (
        'SELECT 34.25, .92, 14., 78.23e-24, 12+23*\n'
        'hello as foo_bar0234, "nope", -- first\n'
        'ARRAY<INT64>[1,2]\n'
        'FROM `myproject.mydataset.mytable` -- more stuff\n'
        'WHERE goodbye <= False\n'
    )
    expected_tokens = [
        'SELECT', '34.25', ',', '.92', ',', '14.', ',', '78.23e-24', ',',
        '12', '+', '23', '*', 'hello', 'as', 'foo_bar0234', ',',
        '"nope"', ',',
        'ARRAY', '<', 'INT64', '>', '[', '1', ',', '2', ']',
        'FROM', '`myproject.mydataset.mytable`',
        'WHERE', 'goodbye', '<=', 'False',
    ]
    self.assertEqual(tokenize(sql), expected_tokens)
def test_create_table(self, statement, already_exists):
    # type: (str, bool) -> None
    """Execute `statement` and check the resulting table's path and schema.

    Args:
        statement: Statement text to parse and execute.
        already_exists: If true, a table with a different schema is stored at
            the target path before the statement is executed.
    """
    statement_node, leftover = apply_rule(statement_rule, tokenize(statement))
    self.assertFalse(leftover)

    table_context = DatasetTableContext({'project': {'dataset': {}}})
    original_table = TypedDataFrame(
        pd.DataFrame([], columns=['x', 'y', 'z']),
        [BQScalarType.STRING, BQScalarType.INTEGER, BQScalarType.BOOLEAN])
    if already_exists:
        table_context.set(('project', 'dataset', 'table'), original_table)
    assert isinstance(statement_node, Statement)

    result = statement_node.execute(table_context)
    self.assertEqual(result.path, ('project', 'dataset', 'table'))

    # In either case the stored table must end up with the schema (a, b).
    stored_table, unused_name = table_context.lookup(result.path)
    actual_columns = list(stored_table.dataframe.columns)
    self.assertEqual(actual_columns, ['a', 'b'])
    self.assertEqual(stored_table.types,
                     [BQScalarType.INTEGER, BQScalarType.STRING])
def test_select_distinct(self, select, expected_result):
    # type: (str, List[List[int]]) -> None
    """Evaluate `select` over a two-row table whose column a repeats a value.

    Args:
        select: SELECT statement text to parse and evaluate.
        expected_result: Rows the resulting dataframe should contain.
    """
    table_context = DatasetTableContext({
        'my_project': {
            'my_dataset': {
                'my_table': TypedDataFrame(
                    pd.DataFrame([[1, 2], [1, 3]], columns=['a', 'b']),
                    types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
            }
        }
    })
    select_node, leftover = select_rule(tokenize(select))
    # Check for unparsed tokens before evaluating, so an incomplete parse is
    # reported as such rather than as a downstream evaluation failure.
    self.assertFalse(leftover)
    assert isinstance(select_node, Select)
    dataframe, unused_table_name = select_node.get_dataframe(table_context)
    self.assertEqual(dataframe.to_list_of_lists(), expected_result)