def test_data_source_joins(
        self,
        join_type,  # type: Union[_EmptyNode, str]
        table1,  # type: List[List[int]]
        table2,  # type: List[List[int]]
        result  # type: List[List[int]]
):
    # type: (...) -> None
    """Join two integer tables and check the joined rows and column names.

    A ``USING (a)`` clause is appended to the data source text unless the
    join is a comma join or a ``CROSS JOIN``.

    Args:
        join_type: Join keyword(s) placed between the two table names.
        table1: Rows for ``my_table`` (columns ``a``, ``b``).
        table2: Rows for ``my_table2`` (columns ``a``, ``c``).
        result: Rows expected in the joined table.
    """
    left_table = TypedDataFrame(
        pd.DataFrame(table1, columns=['a', 'b']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    right_table = TypedDataFrame(
        pd.DataFrame(table2, columns=['a', 'c']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    table_context = DatasetTableContext({
        'my_project': {
            'my_dataset': {
                'my_table': left_table,
                'my_table2': right_table,
            }
        }
    })

    # Comma joins and cross joins take no join condition.
    if join_type in (',', 'CROSS JOIN'):
        using_clause = ''
    else:
        using_clause = 'USING (a)'
    source_text = 'my_table {} my_table2 {}'.format(join_type, using_clause)
    tokens = tokenize(source_text)

    data_source_node, leftover = apply_rule(data_source, tokens)
    self.assertFalse(leftover)
    assert isinstance(data_source_node, DataSource)

    context = data_source_node.create_context(table_context)
    self.assertEqual(context.table.to_list_of_lists(), result)
    expected_columns = ['my_table.a', 'my_table.b', 'my_table2.a', 'my_table2.c']
    self.assertEqual(list(context.table.dataframe), expected_columns)
def test_select_star(self, select, expected_result):
    # type: (str, List[List[int]]) -> None
    """Evaluate ``select`` against two three-column integer tables.

    Args:
        select: SQL text of the select statement to parse and evaluate.
        expected_result: Rows the evaluated select must produce.
    """
    first_table = TypedDataFrame(
        pd.DataFrame([[2, 8, 4], [6, 3, 0], [12, 10, 1]],
                     columns=['a', 'b', 'c']),
        types=[BQScalarType.INTEGER] * 3)
    second_table = TypedDataFrame(
        pd.DataFrame([[2, 7, 3], [6, 2, -1], [12, 9, 0]],
                     columns=['a', 'd', 'e']),
        types=[BQScalarType.INTEGER] * 3)
    group_table_context = DatasetTableContext(
        {'p': {'d': {'table1': first_table, 'table2': second_table}}})

    tokens = tokenize(select)
    select_node, leftover = select_rule(tokens)
    assert isinstance(select_node, Select)

    evaluated = select_node.get_dataframe(group_table_context)
    dataframe, unused_table_name = evaluated
    self.assertFalse(leftover)
    self.assertEqual(dataframe.to_list_of_lists(), expected_result)
def test_data_source_join_on_field_comparison(self, condition, expected_result):
    # type: (str, List[List[int]]) -> None
    """Join two fully-qualified tables with an ``on`` condition and check the result.

    Args:
        condition: Text placed after ``on`` in the join clause.
        expected_result: Rows expected in the joined table.
    """
    left_table = TypedDataFrame(
        pd.DataFrame([[1, 9], [2, 8], [2, 1]], columns=['a', 'b']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    right_table = TypedDataFrame(
        pd.DataFrame([[1, 2], [3, 4]], columns=['c', 'd']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    table_context = DatasetTableContext({
        'my_project': {
            'my_dataset': {
                'my_table': left_table,
                'my_table2': right_table,
            }
        }
    })

    join_template = (
        'my_project.my_dataset.my_table join my_project.my_dataset.my_table2 on {}')
    tokens = tokenize(join_template.format(condition))
    data_source_node, leftover = data_source(tokens)
    self.assertFalse(leftover)
    assert isinstance(data_source_node, DataSource)

    context = data_source_node.create_context(table_context)
    self.assertEqual(context.table.to_list_of_lists(), expected_result)
    expected_columns = ['my_table.a', 'my_table.b', 'my_table2.c', 'my_table2.d']
    self.assertEqual(list(context.table.dataframe), expected_columns)
def test_data_source_join_on_arbitrary_bool(
        self,
        join_type,  # type: Union[_EmptyNode, str]
        result  # type: List[List[int]]
):
    # type: (...) -> None
    """Join two single-column tables on the condition ``MOD(a + b, 3) = 0``.

    Args:
        join_type: Join keyword(s) placed between the two table names.
        result: Rows expected in the joined table.
    """
    left_table = TypedDataFrame(
        pd.DataFrame([[1], [2]], columns=['a']),
        types=[BQScalarType.INTEGER])
    right_table = TypedDataFrame(
        pd.DataFrame([[2], [0]], columns=['b']),
        types=[BQScalarType.INTEGER])
    table_context = DatasetTableContext({
        'my_project': {
            'my_dataset': {
                'my_table': left_table,
                'my_table2': right_table,
            }
        }
    })

    source_text = 'my_table {} my_table2 ON MOD(a + b, 3) = 0'.format(join_type)
    data_source_node, leftover = apply_rule(data_source, tokenize(source_text))
    self.assertFalse(leftover)
    assert isinstance(data_source_node, DataSource)

    context = data_source_node.create_context(table_context)
    self.assertEqual(context.table.to_list_of_lists(), result)
def setUp(self):
    # type: () -> None
    """Create the small (one-column) and large (three-column) table contexts."""
    one_column_table = TypedDataFrame(
        pd.DataFrame([[1], [2]], columns=['a']),
        types=[BQScalarType.INTEGER])
    self.small_table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': one_column_table}}})

    three_column_table = TypedDataFrame(
        pd.DataFrame([[1, 2, 3], [1, 4, 3]], columns=['a', 'b', 'c']),
        types=[BQScalarType.INTEGER] * 3)
    self.large_table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': three_column_table}}})
def test_data_source_join_overlapping_fields(self):
    """Inner-join two tables that share column ``a`` with no explicit condition.

    The ``DataSource`` is built directly from table references, with
    ``EMPTY_NODE`` as the join condition, rather than parsed from SQL text.
    """
    left_table = TypedDataFrame(
        pd.DataFrame([[1, 9]], columns=['a', 'b']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    right_table = TypedDataFrame(
        pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'd']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    table_context = DatasetTableContext({
        'my_project': {
            'my_dataset': {
                'my_table': left_table,
                'my_table2': right_table,
            }
        }
    })

    # Each table is a (reference, alias) pair; EMPTY_NODE means "no alias".
    initial_table = (
        TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
    join_table = (
        TableReference(('my_project', 'my_dataset', 'my_table2')), EMPTY_NODE)
    joins = [('INNER', join_table, EMPTY_NODE)]

    source_node = DataSource(initial_table, joins)
    context = source_node.create_context(table_context)

    self.assertEqual(context.table.to_list_of_lists(), [[1, 9, 1, 2]])
    expected_columns = ['my_table.a', 'my_table.b', 'my_table2.a', 'my_table2.d']
    self.assertEqual(list(context.table.dataframe), expected_columns)
def test_exists_reference_outer(self):
    """Evaluate EXISTS over a subquery that references a table from the outer context.

    The subquery's where clause compares ``my_table.a`` with ``my_table2.b``;
    ``my_table2`` is added to the evaluation context before evaluating.
    """
    inner_table = TypedDataFrame(
        pd.DataFrame([[1], [4]], columns=['a']),
        types=[BQScalarType.INTEGER])
    outer_table = TypedDataFrame(
        pd.DataFrame([[4], [2]], columns=['b']),
        types=[BQScalarType.INTEGER])
    table_context = DatasetTableContext({
        'my_project': {
            'my_dataset': {
                'my_table': inner_table,
                'my_table2': outer_table,
            }
        }
    })

    select_query = ("select a from `my_project.my_dataset.my_table` where "
                    "my_table.a = my_table2.b")
    tokens = tokenize(select_query)
    select_node, leftover = apply_rule(select_rule, tokens)
    self.assertFalse(leftover)

    exists = Exists(select_node)
    context = EvaluationContext(table_context)
    outer_reference = TableReference(('my_project', 'my_dataset', 'my_table2'))
    context.add_table_from_node(outer_reference, EMPTY_NODE)

    dataframe = exists.evaluate(context)
    self.assertEqual(list(dataframe.series), [True, False])
def test_get_typed_dataframe_schema(self):
    """Check ``to_bq_schema`` for a boolean column and a float-array column."""
    empty_frame = pd.DataFrame(columns=['a', 'b'])
    column_types = [BQScalarType.BOOLEAN, BQArray(BQScalarType.FLOAT)]
    typed_dataframe = TypedDataFrame(empty_frame, column_types)

    expected_schema = [
        SchemaField(name='a', field_type='BOOLEAN'),
        SchemaField(name='b', field_type='FLOAT', mode='REPEATED'),
    ]
    self.assertEqual(typed_dataframe.to_bq_schema(), expected_schema)
def test_join_conditions(self, condition, expected_result):
    """Full-outer-join ``lefty`` and ``righty`` using the given join condition.

    Args:
        condition: Join condition text appended after the joined table name.
        expected_result: Rows the query must produce.
    """
    dataset = self.datasets['my_project']['my_dataset']
    dataset['lefty'] = TypedDataFrame(
        pd.DataFrame([[1., 10.], [2., 20.]], columns=['a', 'b']),
        types=[BQScalarType.FLOAT, BQScalarType.FLOAT])
    dataset['righty'] = TypedDataFrame(
        pd.DataFrame([[1., 100.], [3., 300.]], columns=['a', 'c']),
        types=[BQScalarType.FLOAT, BQScalarType.FLOAT])

    # The two literals are concatenated with no whitespace between them, so
    # the closing backtick of the first table name directly precedes `full`.
    query_template = (
        'select lefty.a,righty.a,b,c from `my_project.my_dataset.lefty`'
        'full outer join `my_project.my_dataset.righty` {}')
    sql_query = query_template.format(condition)

    result = execute_query(sql_query, self.datasets)
    self.assertEqual(result.to_list_of_lists(), expected_result)
def setUp(self):
    # type: () -> None
    """Create a table context holding a one-column and a two-column integer table."""
    single_column = TypedDataFrame(
        pd.DataFrame([[1], [2]], columns=['a']),
        types=[BQScalarType.INTEGER])
    double_column = TypedDataFrame(
        pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    self.table_context = DatasetTableContext({
        'my_project': {
            'my_dataset': {
                'my_table': single_column,
                'my_table2': double_column,
            }
        }
    })
def test_join_types(self, join_type, expected_result):
    """Join ``lefty`` and ``righty`` on ``lefty.a=righty.a`` with the given join type.

    Args:
        join_type: Text inserted immediately before ``join`` in the query.
        expected_result: Rows the query must produce.
    """
    # Missing join stuff to test.
    # alias with AS; alias without AS
    dataset = self.datasets['my_project']['my_dataset']
    dataset['lefty'] = TypedDataFrame(
        pd.DataFrame([[1., 10.], [2., 20.]], columns=['a', 'b']),
        types=[BQScalarType.FLOAT, BQScalarType.FLOAT])
    dataset['righty'] = TypedDataFrame(
        pd.DataFrame([[1., 100.], [3., 300.]], columns=['a', 'c']),
        types=[BQScalarType.FLOAT, BQScalarType.FLOAT])

    # The two literals are concatenated with no whitespace between them, so
    # `join_type` directly follows the closing backtick of the first table.
    query_template = (
        'select lefty.a, righty.a, b, c from `my_project.my_dataset.lefty`'
        '{} join `my_project.my_dataset.righty` on lefty.a=righty.a')
    sql_query = query_template.format(join_type)

    result = execute_query(sql_query, self.datasets)
    self.assertEqual(result.to_list_of_lists(), expected_result)
def test_data_source_join_multiple_joins(self):
    """Chain two FULL joins across three tables and check the combined rows.

    ``my_table2`` is joined on ``a`` and ``my_table3`` is then joined on ``b``.
    """
    first_table = TypedDataFrame(
        pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['a', 'b1', 'c1']),
        types=[BQScalarType.INTEGER] * 3)
    second_table = TypedDataFrame(
        pd.DataFrame([[1, 8, 9], [0, 7, 2]], columns=['a', 'b', 'c2']),
        types=[BQScalarType.INTEGER] * 3)
    third_table = TypedDataFrame(
        pd.DataFrame([[3, 4, 5], [6, 7, 8]], columns=['a3', 'b', 'c3']),
        types=[BQScalarType.INTEGER] * 3)
    table_context = DatasetTableContext({
        'my_project': {
            'my_dataset': {
                'my_table': first_table,
                'my_table2': second_table,
                'my_table3': third_table,
            }
        }
    })

    initial_table = (
        TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
    join_table2 = (
        TableReference(('my_project', 'my_dataset', 'my_table2')), EMPTY_NODE)
    join_table3 = (
        TableReference(('my_project', 'my_dataset', 'my_table3')), EMPTY_NODE)
    join_type = 'FULL'
    joins = [
        (join_type, join_table2, ('a', )),
        (join_type, join_table3, ('b', )),
    ]

    source_node = DataSource(initial_table, joins)
    context = source_node.create_context(table_context)

    expected_rows = [
        [1, 2, 3, 1, 8, 9, None, None, None],
        [4, 5, 6, None, None, None, None, None, None],
        [None, None, None, 0, 7, 2, 6, 7, 8],
        [None, None, None, None, None, None, 3, 4, 5],
    ]
    self.assertEqual(context.table.to_list_of_lists(), expected_rows)
def test_get_typed_dataframe_as_list_of_lists(self):
    """Check ``to_list_of_lists`` on a frame built from numpy scalar values.

    The input rows hold ``np.bool_`` values and tuples of ``np.float64``;
    the expected output is written with plain Python literals.
    """
    first_row = [
        np.bool_(True),
        (np.float64(1.5), np.float64(2.5), np.float64(3.0)),
    ]
    second_row = [
        np.bool_(False),
        (np.float64(2.5), np.float64(3.5), np.float64(4.0)),
    ]
    frame = pd.DataFrame([first_row, second_row], columns=['a', 'b'])
    typed_dataframe = TypedDataFrame(
        frame, [BQScalarType.BOOLEAN, BQArray(BQScalarType.FLOAT)])

    expected_rows = [[True, (1.5, 2.5, 3.0)], [False, (2.5, 3.5, 4.0)]]
    self.assertEqual(typed_dataframe.to_list_of_lists(), expected_rows)
def test_table_reference_multi_dataset(self):
    # type: () -> None
    """A bare table name must raise ValueError when two datasets are available."""
    datasets = {
        'dataset1': {'table1': TypedDataFrame(pd.DataFrame(), [])},
        'dataset2': {'table2': TypedDataFrame(pd.DataFrame(), [])},
    }
    new_table_context = DatasetTableContext({'project1': datasets})
    table_ref = TableReference(('table1', ))

    # Regex pattern: parentheses and brackets in the message are escaped.
    expected_error = (
        "Non-fully-qualified table \\('table1',\\) with multiple possible "
        "datasets \\['dataset1', 'dataset2'\\]")
    # NOTE(review): assertRaisesRegexp is a deprecated alias of
    # assertRaisesRegex (removed in Python 3.12); kept unchanged here.
    with self.assertRaisesRegexp(ValueError, expected_error):
        table_ref.get_dataframe(new_table_context)
def test_array_agg_arguments(self, query, expected_result):
    # type: (str, Tuple[Optional[int], ...]) -> None
    """Evaluate ``query`` over a one-column table that contains a null.

    ``' FROM p.d.t'`` is appended to ``query`` before parsing; the result is
    expected to be a single row with a single cell equal to ``expected_result``.

    Args:
        query: Select text without its FROM clause.
        expected_result: Value expected in the single result cell.
    """
    source_table = TypedDataFrame(
        pd.DataFrame([[1], [1], [2], [None]], columns=['a']),
        types=[BQScalarType.INTEGER])
    table_context = DatasetTableContext({'p': {'d': {'t': source_table}}})

    full_query = query + ' FROM p.d.t'
    node, leftover = select_rule(tokenize(full_query))
    self.assertFalse(leftover)
    assert isinstance(node, Select)

    result, unused_table_name = node.get_dataframe(table_context)
    self.assertEqual(result.to_list_of_lists(), [[expected_result]])
def test_non_aggregate_function_in_group_by(self):
    """Apply ``max`` to a ``concat`` expression within a ``group by b`` query."""
    # NOTE(review): column `b` holds the strings '1' and '2' although it is
    # declared INTEGER — confirm whether this mismatch is intentional.
    rows = [['one', '1'], ['two', '1'], ['three', '2'], ['four', '2']]
    source_table = TypedDataFrame(
        pd.DataFrame(rows, columns=['a', 'b']),
        types=[BQScalarType.STRING, BQScalarType.INTEGER])
    table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': source_table}}})

    query = 'select max(concat(b, "hi")) from my_table group by b'
    node, leftover = select_rule(tokenize(query))
    self.assertFalse(leftover)

    result, unused_table_name = node.get_dataframe(table_context)
    self.assertEqual(result.to_list_of_lists(), [['1hi'], ['2hi']])
def test_analytic_function_with_group_by(self, selectors, expected_result):
    """Evaluate ``selectors`` in a ``group by b`` query, ignoring row order.

    Args:
        selectors: Select-list text placed between ``select`` and ``from``.
        expected_result: Rows the query must produce, in any order.
    """
    rows = [[20, 2], [10, 2], [30, 3], [31, 3], [32, 3]]
    source_table = TypedDataFrame(
        pd.DataFrame(rows, columns=['a', 'b']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': source_table}}})

    query = 'select {} from my_table group by b'.format(selectors)
    node, leftover = select_rule(tokenize(query))
    result, unused_table_name = node.get_dataframe(table_context)
    self.assertFalse(leftover)

    # Note: BQ docs say if ORDER BY clause (for the select as a whole) is not
    # present, order of results is undefined, so we do not assert on the order.
    actual_rows = result.to_list_of_lists()
    six.assertCountEqual(self, actual_rows, expected_result)
def test_aggregate_functions_in_group_by(self, selectors, expected_result):
    # type: (str, List[List[int]]) -> None
    """Evaluate ``selectors`` in a ``group by b`` query over data containing a NaN.

    Args:
        selectors: Select-list text placed between ``select`` and ``from``.
        expected_result: Rows the query must produce.
    """
    rows = [[2, 1], [4, 1], [5, 2], [np.nan, 2]]
    source_table = TypedDataFrame(
        pd.DataFrame(rows, columns=['a', 'b']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': source_table}}})

    query = 'select {} from my_table group by b'.format(selectors)
    node, leftover = select_rule(tokenize(query))
    assert isinstance(node, Select)

    result, unused_table_name = node.get_dataframe(table_context)
    self.assertFalse(leftover)
    self.assertEqual(result.to_list_of_lists(), expected_result)
def setUp(self):
    """Populate ``self.datasets`` with the fixture tables under ``my_project.my_dataset``."""
    tables = {}

    # Single integer column `i` holding 0..9.
    tables['ten_rows'] = TypedDataFrame(
        pd.DataFrame([[value] for value in range(10)], columns=['i']),
        [BQScalarType.INTEGER])

    tables['table1'] = TypedDataFrame(
        pd.DataFrame([[1, 2, 3], [2, 3, 4], [3, 4, 5]],
                     columns=['a', 'b', 'c']),
        [BQScalarType.INTEGER] * 3)

    tables['table2'] = TypedDataFrame(
        pd.DataFrame([[1, 6, 0], [2, 7, 1], [2, 8, 2]],
                     columns=['a', 'd', 'e']),
        [BQScalarType.INTEGER] * 3)

    tables['table3'] = TypedDataFrame(
        pd.DataFrame([[1, 1, 0], [2, 1, 0], [3, 1, 3]],
                     columns=['a', 'b', 'c']),
        [BQScalarType.INTEGER] * 3)

    # Integer column with a duplicate value and a null.
    tables['counts'] = TypedDataFrame(
        pd.DataFrame([[1], [1], [2], [None]], columns=['i']),
        [BQScalarType.INTEGER])

    # One datetime value with a non-zero microsecond component.
    moment = datetime.datetime(2001, 2, 3, 4, 5, 6, 789)
    tables['timetable'] = TypedDataFrame(
        pd.DataFrame([[moment]], columns=['t']),
        [BQScalarType.DATETIME])

    self.datasets = {'my_project': {'my_dataset': tables}}
def test_create_table_already_exists(self):
    # type: () -> None
    """CREATE TABLE on an existing path must raise ValueError('Already Exists')."""
    node, leftover = apply_rule(statement_rule, tokenize(
        'CREATE TABLE project.dataset.table (a int64, b string);'))
    self.assertFalse(leftover)

    # Pre-populate the target path so the statement collides with it.
    table_context = DatasetTableContext({'project': {'dataset': {}}})
    original_table = TypedDataFrame(
        pd.DataFrame([], columns=['x', 'y', 'z']),
        [BQScalarType.STRING, BQScalarType.INTEGER, BQScalarType.BOOLEAN])
    table_context.set(('project', 'dataset', 'table'), original_table)

    assert isinstance(node, Statement)
    # The redundant bare `return` that ended the original test was removed.
    with self.assertRaisesRegexp(ValueError, 'Already Exists'):
        node.execute(table_context)
def test_data_source_join_multiple_columns(self):
    """FULL-join two tables on the column pair ``('a', 'b')`` and check the rows."""
    left_table = TypedDataFrame(
        pd.DataFrame([[1, 2, 3], [1, 5, 6]], columns=['a', 'b', 'c']),
        types=[BQScalarType.INTEGER] * 3)
    right_table = TypedDataFrame(
        pd.DataFrame([[1, 2, 7], [3, 2, 8]], columns=['a', 'b', 'd']),
        types=[BQScalarType.INTEGER] * 3)
    table_context = DatasetTableContext({
        'my_project': {
            'my_dataset': {
                'my_table': left_table,
                'my_table2': right_table,
            }
        }
    })

    initial_table = (
        TableReference(('my_project', 'my_dataset', 'my_table')), EMPTY_NODE)
    join_table = (
        TableReference(('my_project', 'my_dataset', 'my_table2')), EMPTY_NODE)
    joins = [('FULL', join_table, ('a', 'b'))]

    source_node = DataSource(initial_table, joins)
    context = source_node.create_context(table_context)

    expected_rows = [
        [1, 2, 3, 1, 2, 7],
        [1, 5, 6, None, None, None],
        [None, None, None, 3, 2, 8],
    ]
    self.assertEqual(context.table.to_list_of_lists(), expected_rows)
def test_create_table_if_not_exists_and_it_does(self):
    # type: () -> None
    """CREATE TABLE IF NOT EXISTS must leave an already-present table in place."""
    statement_text = (
        'CREATE TABLE IF NOT EXISTS project.dataset.table (a int64, b string);')
    node, leftover = apply_rule(statement_rule, tokenize(statement_text))
    self.assertFalse(leftover)

    table_context = DatasetTableContext({'project': {'dataset': {}}})
    existing_columns = pd.DataFrame([], columns=['x', 'y', 'z'])
    original_table = TypedDataFrame(
        existing_columns,
        [BQScalarType.STRING, BQScalarType.INTEGER, BQScalarType.BOOLEAN])
    table_path = ('project', 'dataset', 'table')
    table_context.set(table_path, original_table)

    assert isinstance(node, Statement)
    result = node.execute(table_context)
    self.assertEqual(result.path, ('project', 'dataset', 'table'))

    # The stored table must be the very same object that was set above.
    table, unused_name = table_context.lookup(result.path)
    self.assertIs(table, original_table)
def test_create_table(self, statement, already_exists):
    # type: (str, bool) -> None
    """Execute a create-table ``statement`` and check the stored table's schema.

    Args:
        statement: SQL text of the statement to parse and execute.
        already_exists: Whether a table is stored at the target path before
            the statement runs.
    """
    node, leftover = apply_rule(statement_rule, tokenize(statement))
    self.assertFalse(leftover)

    table_context = DatasetTableContext({'project': {'dataset': {}}})
    original_table = TypedDataFrame(
        pd.DataFrame([], columns=['x', 'y', 'z']),
        [BQScalarType.STRING, BQScalarType.INTEGER, BQScalarType.BOOLEAN])
    if already_exists:
        table_context.set(('project', 'dataset', 'table'), original_table)

    assert isinstance(node, Statement)
    result = node.execute(table_context)
    self.assertEqual(result.path, ('project', 'dataset', 'table'))

    table, unused_name = table_context.lookup(result.path)
    stored_columns = list(table.dataframe.columns)
    self.assertEqual(stored_columns, ['a', 'b'])
    self.assertEqual(table.types, [BQScalarType.INTEGER, BQScalarType.STRING])
def test_count(self, count, expected_result):
    # type: (str, List[List[int]]) -> None
    """Evaluate a ``SELECT <count> FROM my_table`` query over data with a null.

    Args:
        count: Select-list text placed between ``SELECT`` and ``FROM``.
        expected_result: Rows the query must produce.
    """
    source_table = TypedDataFrame(
        pd.DataFrame([[1, 2, 3], [None, 4, 3]], columns=['a', 'b', 'c']),
        types=[BQScalarType.INTEGER] * 3)
    count_table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': source_table}}})

    query = 'SELECT {} FROM my_table'.format(count)
    select, leftover = select_rule(tokenize(query))
    self.assertFalse(leftover)
    assert isinstance(select, Select)

    dataframe, unused_table_name = select.get_dataframe(count_table_context)
    self.assertEqual(dataframe.to_list_of_lists(), expected_result)
def test_select_distinct(self, select, expected_result):
    # type: (str, List[List[int]]) -> None
    """Evaluate ``select`` over a table whose two rows share the same ``a`` value.

    Args:
        select: SQL text of the select statement to parse and evaluate.
        expected_result: Rows the evaluated select must produce.
    """
    source_table = TypedDataFrame(
        pd.DataFrame([[1, 2], [1, 3]], columns=['a', 'b']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': source_table}}})

    tokens = tokenize(select)
    select_node, leftover = select_rule(tokens)
    assert isinstance(select_node, Select)

    dataframe, unused_table_name = select_node.get_dataframe(table_context)
    self.assertFalse(leftover)
    self.assertEqual(dataframe.to_list_of_lists(), expected_result)
def test_exists_index(self):
    """Compare a boolean column against ``exists(select 1)`` row by row."""
    bool_table = TypedDataFrame(
        pd.DataFrame([[True], [False]], columns=['a']),
        types=[BQScalarType.BOOLEAN])
    table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'bool_table': bool_table}}})

    select_query = (
        'select a = exists(select 1) from `my_project.my_dataset.bool_table`')
    tokens = tokenize(select_query)
    select_node, leftover = apply_rule(select_rule, tokens)
    self.assertFalse(leftover)

    result, unused_table_name = select_node.get_dataframe(table_context)
    self.assertEqual(result.to_list_of_lists(), [[True], [False]])
def test_aggregate_functions_in_expressions(self, query, expected_result):
    # type: (str, List[List[int]]) -> None
    """Evaluate ``query`` against a one-column table holding 1, 2 and 3.

    Args:
        query: SQL text of the select statement to parse and evaluate.
        expected_result: Rows the evaluated select must produce.
    """
    source_table = TypedDataFrame(
        pd.DataFrame([[1], [2], [3]], columns=['a']),
        types=[BQScalarType.INTEGER])
    table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': source_table}}})

    tokens = tokenize(query)
    node, leftover = select_rule(tokens)
    assert isinstance(node, Select)

    result, unused_table_name = node.get_dataframe(table_context)
    self.assertFalse(leftover)
    self.assertEqual(result.to_list_of_lists(), expected_result)
def test_query_expression_set_operation_error(self, query_expression, error):
    """Evaluating ``query_expression`` must raise ValueError matching ``error``.

    Args:
        query_expression: SQL text of the query expression to parse.
        error: Regular expression the ValueError message must match.
    """
    source_table = TypedDataFrame(
        pd.DataFrame([[1, 2, 3]], columns=['a', 'b', 'c']),
        [BQScalarType.INTEGER] * 3)
    table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': source_table}}})

    tokens = tokenize(query_expression)
    query_expression_node, leftover = query_expression_rule(tokens)
    self.assertFalse(leftover)

    # NOTE(review): assertRaisesRegexp is a deprecated alias of
    # assertRaisesRegex (removed in Python 3.12); kept unchanged here.
    with self.assertRaisesRegexp(ValueError, error):
        query_expression_node.get_dataframe(table_context)
def test_query_expression_set_operation(self, query_expression, expected_result):
    """Evaluate ``query_expression`` against a single-row, three-column table.

    Args:
        query_expression: SQL text of the query expression to parse.
        expected_result: Rows the evaluated expression must produce.
    """
    source_table = TypedDataFrame(
        pd.DataFrame([[1, 2, 3]], columns=['a', 'b', 'c']),
        [BQScalarType.INTEGER] * 3)
    table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': source_table}}})

    tokens = tokenize(query_expression)
    query_expression_node, leftover = query_expression_rule(tokens)

    evaluated = query_expression_node.get_dataframe(table_context)
    dataframe, unused_table_name = evaluated
    self.assertFalse(leftover)
    self.assertEqual(dataframe.to_list_of_lists(), expected_result)
def test_select_where(self, where):
    # type: (EvaluatableNode) -> None
    """Select column ``a`` with the given where node; only the row with a=3 must remain.

    The ``Select`` is assembled directly from nodes rather than parsed from SQL.

    Args:
        where: Node used as the select's where clause.
    """
    source_table = TypedDataFrame(
        pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b']),
        types=[BQScalarType.INTEGER, BQScalarType.INTEGER])
    where_table_context = DatasetTableContext(
        {'my_project': {'my_dataset': {'my_table': source_table}}})

    fields = [Selector(Field(('a', )), EMPTY_NODE)]
    table_reference = TableReference(('my_project', 'my_dataset', 'my_table'))
    from_ = DataSource((table_reference, EMPTY_NODE), [])
    select = Select(EMPTY_NODE, fields, from_, where, EMPTY_NODE, EMPTY_NODE)

    dataframe, unused_table_name = select.get_dataframe(where_table_context)
    self.assertEqual(dataframe.to_list_of_lists(), [[3]])