def setUp(self):
    # type: () -> None
    '''Build two table contexts that both expose `my_project.my_dataset.my_table`.

    `small_table_context` holds a single INTEGER column `a`;
    `large_table_context` holds three INTEGER columns `a`, `b` and `c`.
    '''
    one_column_table = TypedDataFrame(
        pd.DataFrame([[1], [2]], columns=['a']),
        types=[BQScalarType.INTEGER])
    self.small_table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': one_column_table}}})

    three_column_table = TypedDataFrame(
        pd.DataFrame([[1, 2, 3], [1, 4, 3]], columns=['a', 'b', 'c']),
        types=[BQScalarType.INTEGER] * 3)
    self.large_table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': three_column_table}}})
def test_data_source_joins(
        self,
        join_type,  # type: Union[_EmptyNode, str]
        table1,  # type: List[List[int]]
        table2,  # type: List[List[int]]
        result  # type: List[List[int]]
):
    # type: (...) -> None
    '''Join `my_table` with `my_table2` using `join_type` and compare rows to `result`.

    A `USING (a)` clause is appended unless the join is a comma join or a
    CROSS JOIN, in which case nothing is appended after the second table.
    '''
    left_frame = TypedDataFrame(
        pd.DataFrame(table1, columns=['a', 'b']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    right_frame = TypedDataFrame(
        pd.DataFrame(table2, columns=['a', 'c']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    table_context = DatasetTableContext({
        'my_project': {
            'my_dataset': {
                'my_table': left_frame,
                'my_table2': right_frame,
            }
        }
    })

    if join_type in (',', 'CROSS JOIN'):
        join_condition = ''
    else:
        join_condition = 'USING (a)'
    tokens = tokenize(
        'my_table {} my_table2 {}'.format(join_type, join_condition))

    parsed_source, leftover = apply_rule(data_source, tokens)
    self.assertFalse(leftover)
    assert isinstance(parsed_source, DataSource)

    context = parsed_source.create_context(table_context)
    self.assertEqual(context.table.to_list_of_lists(), result)
    # Joined columns are expected to be qualified with their source table name.
    expected_columns = [
        'my_table.a', 'my_table.b', 'my_table2.a', 'my_table2.c'
    ]
    self.assertEqual(list(context.table.dataframe), expected_columns)
def test_exists_reference_outer(self):
    '''Evaluate EXISTS over a subquery whose WHERE clause names an outer table.

    The subquery selects from `my_table` but compares against `my_table2.b`;
    `my_table2` is only made available through the evaluation context.
    '''
    inner_table = TypedDataFrame(
        pd.DataFrame([[1], [4]], columns=['a']),
        types=[BQScalarType.INTEGER])
    outer_table = TypedDataFrame(
        pd.DataFrame([[4], [2]], columns=['b']),
        types=[BQScalarType.INTEGER])
    table_context = DatasetTableContext({
        'my_project': {
            'my_dataset': {
                'my_table': inner_table,
                'my_table2': outer_table,
            }
        }
    })

    query_text = ("select a from `my_project.my_dataset.my_table` where "
                  "my_table.a = my_table2.b")
    subquery_node, leftover = apply_rule(select_rule, tokenize(query_text))
    self.assertFalse(leftover)
    exists_node = Exists(subquery_node)

    # Register the outer table so the subquery can resolve `my_table2.b`.
    outer_context = EvaluationContext(table_context)
    outer_reference = TableReference(('my_project', 'my_dataset', 'my_table2'))
    outer_context.add_table_from_node(outer_reference, EMPTY_NODE)

    evaluated = exists_node.evaluate(outer_context)
    self.assertEqual(list(evaluated.series), [True, False])
def test_data_source_join_overlapping_fields(self):
    '''INNER-join two tables that both have a column `a`, with no join condition node.

    The DataSource is built by hand (not parsed) with EMPTY_NODE as the join
    condition; the test expects one joined row and table-qualified column names.
    '''
    first_table = TypedDataFrame(
        pd.DataFrame([[1, 9]], columns=['a', 'b']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    second_table = TypedDataFrame(
        pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'd']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    table_context = DatasetTableContext({
        'my_project': {
            'my_dataset': {
                'my_table': first_table,
                'my_table2': second_table,
            }
        }
    })

    # Each table entry is a (reference, alias) pair; EMPTY_NODE means no alias.
    base_table = (
        TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
    joined_table = (
        TableReference(('my_project', 'my_dataset', 'my_table2')), EMPTY_NODE)
    source_node = DataSource(
        base_table, [('INNER', joined_table, EMPTY_NODE)])

    context = source_node.create_context(table_context)
    self.assertEqual(context.table.to_list_of_lists(), [[1, 9, 1, 2]])
    expected_columns = [
        'my_table.a', 'my_table.b', 'my_table2.a', 'my_table2.d'
    ]
    self.assertEqual(list(context.table.dataframe), expected_columns)
def test_data_source_join_on_field_comparison(self, condition, expected_result):
    # type: (str, List[List[int]]) -> None
    '''Join two fully-qualified tables with `ON <condition>` and check the rows.

    `condition` is substituted into the join's ON clause; `expected_result`
    is the list of joined rows the resulting context table must contain.
    '''
    left_frame = TypedDataFrame(
        pd.DataFrame([[1, 9], [2, 8], [2, 1]], columns=['a', 'b']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    right_frame = TypedDataFrame(
        pd.DataFrame([[1, 2], [3, 4]], columns=['c', 'd']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    table_context = DatasetTableContext({
        'my_project': {
            'my_dataset': {
                'my_table': left_frame,
                'my_table2': right_frame,
            }
        }
    })

    join_text = (
        'my_project.my_dataset.my_table join my_project.my_dataset.my_table2 on {}'
        .format(condition))
    parsed_source, leftover = data_source(tokenize(join_text))
    self.assertFalse(leftover)
    assert isinstance(parsed_source, DataSource)

    context = parsed_source.create_context(table_context)
    self.assertEqual(context.table.to_list_of_lists(), expected_result)
    expected_columns = [
        'my_table.a', 'my_table.b', 'my_table2.c', 'my_table2.d'
    ]
    self.assertEqual(list(context.table.dataframe), expected_columns)
def test_select_star(self, select, expected_result):
    # type: (str, List[List[int]]) -> None
    '''Run the `select` query against tables `p.d.table1` / `p.d.table2`.

    Both tables have three INTEGER columns and share the column name `a`;
    the rows produced by the query must equal `expected_result`.
    '''
    table1 = TypedDataFrame(
        pd.DataFrame([[2, 8, 4], [6, 3, 0], [12, 10, 1]],
                     columns=['a', 'b', 'c']),
        types=[BQScalarType.INTEGER] * 3)
    table2 = TypedDataFrame(
        pd.DataFrame([[2, 7, 3], [6, 2, -1], [12, 9, 0]],
                     columns=['a', 'd', 'e']),
        types=[BQScalarType.INTEGER] * 3)
    group_table_context = DatasetTableContext(
        {'p': {'d': {'table1': table1, 'table2': table2}}})

    parsed_select, leftover = select_rule(tokenize(select))
    assert isinstance(parsed_select, Select)
    rows, unused_table_name = parsed_select.get_dataframe(group_table_context)
    self.assertFalse(leftover)
    self.assertEqual(rows.to_list_of_lists(), expected_result)
def test_data_source_join_on_arbitrary_bool(
        self,
        join_type,  # type: Union[_EmptyNode, str]
        result  # type: List[List[int]]
):
    # type: (...) -> None
    '''Join on a boolean expression that is not a plain field comparison.

    The ON clause is `MOD(a + b, 3) = 0`, combining a column from each table;
    the joined rows for the given `join_type` must equal `result`.
    '''
    left_frame = TypedDataFrame(
        pd.DataFrame([[1], [2]], columns=['a']),
        types=[BQScalarType.INTEGER])
    right_frame = TypedDataFrame(
        pd.DataFrame([[2], [0]], columns=['b']),
        types=[BQScalarType.INTEGER])
    table_context = DatasetTableContext({
        'my_project': {
            'my_dataset': {
                'my_table': left_frame,
                'my_table2': right_frame,
            }
        }
    })

    join_text = 'my_table {} my_table2 ON MOD(a + b, 3) = 0'.format(join_type)
    parsed_source, leftover = apply_rule(data_source, tokenize(join_text))
    self.assertFalse(leftover)
    assert isinstance(parsed_source, DataSource)

    context = parsed_source.create_context(table_context)
    self.assertEqual(context.table.to_list_of_lists(), result)
def test_create_table_with_select_mismatched_types(self, query, error):
    # type: (str, str) -> None
    '''Execute `query` against an empty dataset and expect a ValueError.

    `error` is the regular expression the ValueError's message must match.
    '''
    statement_node, leftover = apply_rule(statement_rule, tokenize(query))
    self.assertFalse(leftover)

    empty_dataset_context = DatasetTableContext({'project': {'dataset': {}}})
    assert isinstance(statement_node, Statement)
    with self.assertRaisesRegexp(ValueError, error):
        statement_node.execute(empty_dataset_context)
def test_complex_types(self, query, expected_result, expected_type):
    # type: (str, Tuple[Optional[int], ...], BQType) -> None
    '''Evaluate `query` with no tables and check its single cell and its type.

    The result must be exactly one row with one value equal to
    `expected_result`, and the result's type list must be `[expected_type]`.
    '''
    empty_context = DatasetTableContext({})
    select_node, leftover = select_rule(tokenize(query))
    self.assertFalse(leftover)
    assert isinstance(select_node, Select)

    evaluated, unused_table_name = select_node.get_dataframe(empty_context)
    self.assertEqual(evaluated.to_list_of_lists(), [[expected_result]])
    self.assertEqual(evaluated.types, [expected_type])
def test_create_table_with_select(self, statement, columns):
    # type: (str, List[str]) -> None
    '''Execute a CREATE TABLE `statement` and inspect the table it stores.

    The table must be created at `project.dataset.table`, have the column
    names in `columns`, and have types INTEGER and STRING.
    '''
    statement_node, leftover = apply_rule(statement_rule, tokenize(statement))
    self.assertFalse(leftover)

    table_context = DatasetTableContext({'project': {'dataset': {}}})
    assert isinstance(statement_node, Statement)
    outcome = statement_node.execute(table_context)
    self.assertEqual(outcome.path, ('project', 'dataset', 'table'))

    # Look the table up again through the context to verify it was stored.
    created_table, unused_name = table_context.lookup(outcome.path)
    self.assertEqual(list(created_table.dataframe.columns), columns)
    expected_types = [BQScalarType.INTEGER, BQScalarType.STRING]
    self.assertEqual(created_table.types, expected_types)
def setUp(self):
    # type: () -> None
    '''Create a context with `my_project.my_dataset.my_table` holding a = 1, 2, 3.'''
    three_row_table = TypedDataFrame(
        pd.DataFrame([[1], [2], [3]], columns=['a']),
        types=[BQScalarType.INTEGER])
    self.table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': three_row_table}}})
def test_create_table_already_exists(self):
    # type: () -> None
    '''A plain CREATE TABLE on an occupied path must raise "Already Exists".'''
    create_sql = 'CREATE TABLE project.dataset.table (a int64, b string);'
    statement_node, leftover = apply_rule(statement_rule, tokenize(create_sql))
    self.assertFalse(leftover)

    # Pre-populate the target path with a table of a different schema.
    table_context = DatasetTableContext({'project': {'dataset': {}}})
    existing_table = TypedDataFrame(
        pd.DataFrame([], columns=['x', 'y', 'z']),
        [BQScalarType.STRING, BQScalarType.INTEGER, BQScalarType.BOOLEAN])
    table_context.set(('project', 'dataset', 'table'), existing_table)

    assert isinstance(statement_node, Statement)
    with self.assertRaisesRegexp(ValueError, 'Already Exists'):
        statement_node.execute(table_context)
def test_create_table_if_not_exists_and_it_does(self):
    # type: () -> None
    '''CREATE TABLE IF NOT EXISTS must leave an already-present table in place.

    After execution, looking up the path must return the very same table
    object that was stored beforehand.
    '''
    create_sql = (
        'CREATE TABLE IF NOT EXISTS project.dataset.table (a int64, b string);')
    statement_node, leftover = apply_rule(statement_rule, tokenize(create_sql))
    self.assertFalse(leftover)

    table_context = DatasetTableContext({'project': {'dataset': {}}})
    existing_table = TypedDataFrame(
        pd.DataFrame([], columns=['x', 'y', 'z']),
        [BQScalarType.STRING, BQScalarType.INTEGER, BQScalarType.BOOLEAN])
    target_path = ('project', 'dataset', 'table')
    table_context.set(target_path, existing_table)

    assert isinstance(statement_node, Statement)
    outcome = statement_node.execute(table_context)
    self.assertEqual(outcome.path, ('project', 'dataset', 'table'))
    found_table, unused_name = table_context.lookup(outcome.path)
    self.assertIs(found_table, existing_table)
def test_data_source_join_multiple_joins(self):
    '''Chain two FULL joins: `my_table` with `my_table2` on `a`, then `my_table3` on `b`.

    The DataSource is built by hand; rows with no match on one side are
    expected to be filled with None.
    '''
    table_one = TypedDataFrame(
        pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['a', 'b1', 'c1']),
        types=[BQScalarType.INTEGER] * 3)
    table_two = TypedDataFrame(
        pd.DataFrame([[1, 8, 9], [0, 7, 2]], columns=['a', 'b', 'c2']),
        types=[BQScalarType.INTEGER] * 3)
    table_three = TypedDataFrame(
        pd.DataFrame([[3, 4, 5], [6, 7, 8]], columns=['a3', 'b', 'c3']),
        types=[BQScalarType.INTEGER] * 3)
    table_context = DatasetTableContext({
        'my_project': {
            'my_dataset': {
                'my_table': table_one,
                'my_table2': table_two,
                'my_table3': table_three,
            }
        }
    })

    def unaliased(table_name):
        # Pair a reference to the named table with EMPTY_NODE (no alias).
        return (TableReference(('my_project', 'my_dataset', table_name)),
                EMPTY_NODE)

    base_table = unaliased('my_table')
    second = unaliased('my_table2')
    third = unaliased('my_table3')
    join_specs = [('FULL', second, ('a', )), ('FULL', third, ('b', ))]
    source_node = DataSource(base_table, join_specs)

    context = source_node.create_context(table_context)
    expected_rows = [
        [1, 2, 3, 1, 8, 9, None, None, None],
        [4, 5, 6, None, None, None, None, None, None],
        [None, None, None, 0, 7, 2, 6, 7, 8],
        [None, None, None, None, None, None, 3, 4, 5],
    ]
    self.assertEqual(context.table.to_list_of_lists(), expected_rows)
def test_create_table(self, statement, already_exists):
    # type: (str, bool) -> None
    '''Execute a CREATE TABLE `statement`, optionally over a pre-existing table.

    When `already_exists` is true, a table with a different schema is stored
    at the target path first. In either case the table found afterwards must
    have columns `a`, `b` with types INTEGER and STRING.
    '''
    statement_node, leftover = apply_rule(statement_rule, tokenize(statement))
    self.assertFalse(leftover)

    table_context = DatasetTableContext({'project': {'dataset': {}}})
    previous_table = TypedDataFrame(
        pd.DataFrame([], columns=['x', 'y', 'z']),
        [BQScalarType.STRING, BQScalarType.INTEGER, BQScalarType.BOOLEAN])
    if already_exists:
        table_context.set(('project', 'dataset', 'table'), previous_table)

    assert isinstance(statement_node, Statement)
    outcome = statement_node.execute(table_context)
    self.assertEqual(outcome.path, ('project', 'dataset', 'table'))

    stored_table, unused_name = table_context.lookup(outcome.path)
    self.assertEqual(list(stored_table.dataframe.columns), ['a', 'b'])
    expected_types = [BQScalarType.INTEGER, BQScalarType.STRING]
    self.assertEqual(stored_table.types, expected_types)
def test_select_distinct(self, select, expected_result):
    # type: (str, List[List[int]]) -> None
    '''Run `select` on a table whose two rows share `a` but differ in `b`.

    The rows returned by the query must equal `expected_result`.
    '''
    source_table = TypedDataFrame(
        pd.DataFrame([[1, 2], [1, 3]], columns=['a', 'b']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': source_table}}})

    parsed_select, leftover = select_rule(tokenize(select))
    assert isinstance(parsed_select, Select)
    rows, unused_table_name = parsed_select.get_dataframe(table_context)
    self.assertFalse(leftover)
    self.assertEqual(rows.to_list_of_lists(), expected_result)
def test_current_timestamp(self):
    # type: () -> None
    '''CURRENT_TIMESTAMP() yields one recent value shared by every row.

    The query selects `current_timestamp()` alongside three unnested structs,
    so the result has three rows; the first column must be within two seconds
    of the current time and identical across all rows.
    '''
    node, leftover = apply_rule(
        query_expression,
        tokenize(
            'select current_timestamp(), a from unnest([struct(1 as a), struct(2), struct(3)])'
        ))
    assert isinstance(node, QueryExpression)
    self.assertFalse(leftover)

    result, _ = node.get_dataframe(DatasetTableContext({}))
    table = cast(List[List[datetime.datetime]], result.to_list_of_lists())
    self.assertEqual(len(table), 3)

    # CURRENT_TIMESTAMP() returns a very recent timestamp.
    # Use total_seconds() rather than the `.seconds` attribute: `.seconds` is
    # only the seconds *component* of the timedelta (0..86399), so a timestamp
    # that is off by whole days would still look "recent", and a slightly
    # negative delta would wrap around to ~86399.
    elapsed = datetime.datetime.now() - table[0][0]
    self.assertLess(abs(elapsed.total_seconds()), 2)

    # All rows have the same timestamp value.
    self.assertEqual(table[0][0], table[1][0])
    self.assertEqual(table[0][0], table[2][0])
def test_table_reference_multi_dataset(self):
    # type: () -> None
    '''An unqualified table name is rejected when the project has several datasets.

    The context has one project with `dataset1` and `dataset2`; resolving the
    bare name `table1` must raise a ValueError naming both datasets.
    '''
    datasets = {
        'dataset1': {
            'table1': TypedDataFrame(pd.DataFrame(), [])
        },
        'dataset2': {
            'table2': TypedDataFrame(pd.DataFrame(), [])
        },
    }
    multi_dataset_context = DatasetTableContext({'project1': datasets})

    bare_reference = TableReference(('table1', ))
    # Parentheses and brackets are escaped because the message is used as a regex.
    expected_error = (
        r"Non-fully-qualified table \('table1',\) with multiple possible "
        r"datasets \['dataset1', 'dataset2'\]")
    with self.assertRaisesRegexp(ValueError, expected_error):
        bare_reference.get_dataframe(multi_dataset_context)
def test_aggregate_functions_in_expressions(self, query, expected_result):
    # type: (str, List[List[int]]) -> None
    '''Evaluate `query` over a one-column table holding a = 1, 2, 3.

    The rows returned by the query must equal `expected_result`.
    '''
    source_table = TypedDataFrame(
        pd.DataFrame([[1], [2], [3]], columns=['a']),
        types=[BQScalarType.INTEGER])
    table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': source_table}}})

    select_node, leftover = select_rule(tokenize(query))
    assert isinstance(select_node, Select)
    evaluated, unused_table_name = select_node.get_dataframe(table_context)
    self.assertFalse(leftover)
    self.assertEqual(evaluated.to_list_of_lists(), expected_result)
def test_array_agg_arguments(self, query, expected_result):
    # type: (str, Tuple[Optional[int], ...]) -> None
    '''Evaluate `query` with ` FROM p.d.t` appended and check its single cell.

    Table `p.d.t` has one INTEGER column `a` with values 1, 1, 2 and None;
    the result must be one row with one value equal to `expected_result`.
    '''
    source_table = TypedDataFrame(
        pd.DataFrame([[1], [1], [2], [None]], columns=['a']),
        types=[BQScalarType.INTEGER])
    table_context = DatasetTableContext({'p': {'d': {'t': source_table}}})

    full_query = query + ' FROM p.d.t'
    select_node, leftover = select_rule(tokenize(full_query))
    self.assertFalse(leftover)
    assert isinstance(select_node, Select)

    evaluated, unused_table_name = select_node.get_dataframe(table_context)
    self.assertEqual(evaluated.to_list_of_lists(), [[expected_result]])
def test_exists_index(self):
    '''Compare a BOOLEAN column against `exists(select 1)` row by row.

    With column values True and False, the comparison is expected to give
    [[True], [False]].
    '''
    bool_table = TypedDataFrame(
        pd.DataFrame([[True], [False]], columns=['a']),
        types=[BQScalarType.BOOLEAN])
    table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'bool_table': bool_table}}})

    query_text = (
        'select a = exists(select 1) from `my_project.my_dataset.bool_table`')
    parsed_select, leftover = apply_rule(select_rule, tokenize(query_text))
    self.assertFalse(leftover)

    evaluated, unused_table_name = parsed_select.get_dataframe(table_context)
    self.assertEqual(evaluated.to_list_of_lists(), [[True], [False]])
def test_query_expression_set_operation_error(self, query_expression, error):
    '''Evaluating the given query expression must raise a ValueError.

    `query_expression` is the query text; `error` is the regular expression
    the ValueError's message must match.
    '''
    single_row_table = TypedDataFrame(
        pd.DataFrame([[1, 2, 3]], columns=['a', 'b', 'c']),
        [BQScalarType.INTEGER] * 3)
    table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': single_row_table}}})

    parsed_query, leftover = query_expression_rule(tokenize(query_expression))
    self.assertFalse(leftover)
    with self.assertRaisesRegexp(ValueError, error):
        parsed_query.get_dataframe(table_context)
def test_analytic_function(self, selectors, expected_result):
    '''Select `selectors` from `my_table` and compare the rows ignoring order.

    `selectors` is the text placed between `select` and `from my_table`.
    '''
    source_table = TypedDataFrame(
        pd.DataFrame([[20, 200], [10, 200], [30, 300], [30, 300]],
                     columns=['a', 'b']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': source_table}}})

    query_text = 'select {} from my_table'.format(selectors)
    select_node, leftover = select_rule(tokenize(query_text))
    evaluated, unused_table_name = select_node.get_dataframe(table_context)
    self.assertFalse(leftover)

    # Note: BQ docs say if ORDER BY clause (for the select as a whole) is not
    # present, order of results is undefined, so we do not assert on the order.
    actual_rows = evaluated.to_list_of_lists()
    six.assertCountEqual(self, actual_rows, expected_result)
def test_non_aggregate_function_in_group_by(self):
    '''An aggregate may wrap a non-aggregate call (`max(concat(...))`) under GROUP BY.

    Grouping by `b` is expected to give one row per distinct `b` value.
    '''
    # NOTE(review): column `b` holds the strings '1' and '2' but is declared
    # INTEGER - confirm whether the declared type is intentional.
    source_table = TypedDataFrame(
        pd.DataFrame(
            [['one', '1'], ['two', '1'], ['three', '2'], ['four', '2']],
            columns=['a', 'b']),
        types=[BQScalarType.STRING, BQScalarType.INTEGER])
    table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': source_table}}})

    query_text = 'select max(concat(b, "hi")) from my_table group by b'
    select_node, leftover = select_rule(tokenize(query_text))
    self.assertFalse(leftover)

    evaluated, unused_table_name = select_node.get_dataframe(table_context)
    self.assertEqual(evaluated.to_list_of_lists(), [['1hi'], ['2hi']])
def test_query_expression_set_operation(self, query_expression, expected_result):
    '''Evaluate the given query expression over a one-row, three-column table.

    `query_expression` is the query text; the rows it produces must equal
    `expected_result`.
    '''
    single_row_table = TypedDataFrame(
        pd.DataFrame([[1, 2, 3]], columns=['a', 'b', 'c']),
        [BQScalarType.INTEGER] * 3)
    table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': single_row_table}}})

    parsed_query, leftover = query_expression_rule(tokenize(query_expression))
    rows, unused_table_name = parsed_query.get_dataframe(table_context)
    self.assertFalse(leftover)
    self.assertEqual(rows.to_list_of_lists(), expected_result)
def test_aggregate_functions_in_group_by(self, selectors, expected_result):
    # type: (str, List[List[int]]) -> None
    '''Select `selectors` from `my_table` grouped by `b` and check the rows.

    The fixture has two rows per `b` value, and one `a` value is NaN.
    '''
    source_table = TypedDataFrame(
        pd.DataFrame([[2, 1], [4, 1], [5, 2], [np.nan, 2]],
                     columns=['a', 'b']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': source_table}}})

    query_text = 'select {} from my_table group by b'.format(selectors)
    select_node, leftover = select_rule(tokenize(query_text))
    assert isinstance(select_node, Select)
    evaluated, unused_table_name = select_node.get_dataframe(table_context)
    self.assertFalse(leftover)
    self.assertEqual(evaluated.to_list_of_lists(), expected_result)
def test_select_where(self, where):
    # type: (EvaluatableNode) -> None
    '''Build `SELECT a FROM my_table WHERE <where>` by hand and expect only a = 3.

    `where` is an already-constructed condition node; the Select node is
    assembled directly rather than parsed from text.
    '''
    source_table = TypedDataFrame(
        pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    where_table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': source_table}}})

    selected_fields = [Selector(Field(('a', )), EMPTY_NODE)]
    table_reference = TableReference(('my_project', 'my_dataset', 'my_table'))
    from_clause = DataSource((table_reference, EMPTY_NODE), [])
    select_node = Select(EMPTY_NODE, selected_fields, from_clause, where,
                         EMPTY_NODE, EMPTY_NODE)

    rows, unused_table_name = select_node.get_dataframe(where_table_context)
    self.assertEqual(rows.to_list_of_lists(), [[3]])
def setUp(self):
    # type: () -> None
    '''Create a context with four INTEGER tables under `my_project.my_dataset`.

    `my_table` has column `a`, `my_table2` has columns `a` and `b`, and
    `my_table3` / `my_table4` each have a single column `c`.
    '''
    tables = {
        'my_table': TypedDataFrame(
            pd.DataFrame([[1], [2]], columns=['a']),
            types=[BQScalarType.INTEGER]),
        'my_table2': TypedDataFrame(
            pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b']),
            types=[BQScalarType.INTEGER, BQScalarType.INTEGER]),
        'my_table3': TypedDataFrame(
            pd.DataFrame([[5], [6]], columns=['c']),
            types=[BQScalarType.INTEGER]),
        'my_table4': TypedDataFrame(
            pd.DataFrame([[7], [8]], columns=['c']),
            types=[BQScalarType.INTEGER]),
    }
    self.table_context = DatasetTableContext(
        {'my_project': {'my_dataset': tables}})
def test_select_group_by_error(self, select):
    # type: (str) -> None
    '''Evaluating `select` must raise ValueError matching "not aggregated or grouped by".'''
    source_table = TypedDataFrame(
        pd.DataFrame([[1, 2, 3], [1, 3, 3]], columns=['a', 'b', 'c']),
        types=[BQScalarType.INTEGER] * 3)
    group_table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': source_table}}})

    parsed_select, leftover = select_rule(tokenize(select))
    assert isinstance(parsed_select, Select)
    self.assertFalse(leftover)
    with self.assertRaisesRegexp(ValueError, "not aggregated or grouped by"):
        parsed_select.get_dataframe(group_table_context)
def test_constructed_column_has_correct_index(self, query, expected_result):
    # type: (str, List[List[int]]) -> None
    '''Checks that manually constructed columns share the index of the data.

    A column built by hand normally gets the default index 0, 1, 2, ...
    (for example, pd.Series(['a', 'b', 'c']) has index 0, 1, 2). Real data
    may carry a different index after filtering, sorting or other changes.
    When one column's index differs from the others, the columns cannot be
    compared or joined with each other properly.
    '''
    source_table = TypedDataFrame(
        pd.DataFrame([[1, 2, -1], [4, 5, 6], [7, 8, 9]],
                     columns=['a', 'b', 'c']),
        types=[BQScalarType.INTEGER] * 3)
    table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': source_table}}})

    # The subquery drops the first row of the table (c = -1 fails `c > 0`),
    # so the queries under test see data indexed [1, 2] instead of the
    # default [0, 1] that a fresh two-row column would get. An expression
    # that wrongly relies on the default index is therefore caught.
    filtered_query = query + ' from (select * from my_table where c > 0)'
    select_node, leftover = select_rule(tokenize(filtered_query))
    assert isinstance(select_node, Select)
    evaluated, unused_table_name = select_node.get_dataframe(table_context)
    self.assertFalse(leftover)
    self.assertEqual(evaluated.to_list_of_lists(), expected_result)
    self.assertEqual(list(evaluated.dataframe.index), [1, 2])