def test_populate_node_family_full_parentage_complex_dependencies(self) -> None:
    """Checks full_parentage when a view's parents also depend on each other.

    Four views are chained so that table_4 reads from table_2 and table_3,
    table_3 reads from table_1 and table_2, and table_2 reads from table_1,
    which itself reads from the source table. The full parentage of table_4
    is expected to be the source table plus the other three views.
    """
    base_view = BigQueryView(
        dataset_id="dataset_1",
        view_id="table_1",
        description="table_1 description",
        view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table`",
    )
    second_view = BigQueryView(
        dataset_id="dataset_2",
        view_id="table_2",
        description="table_2 description",
        view_query_template="SELECT * FROM `{project_id}.dataset_1.table_1`",
    )
    third_view = BigQueryView(
        dataset_id="dataset_3",
        view_id="table_3",
        description="table_3 description",
        view_query_template=(
            " SELECT * FROM `{project_id}.dataset_1.table_1`"
            " JOIN `{project_id}.dataset_2.table_2`"
            " USING (col)"
        ),
    )
    leaf_view = BigQueryView(
        dataset_id="dataset_4",
        view_id="table_4",
        description="table_4 description",
        view_query_template=(
            " SELECT * FROM `{project_id}.dataset_2.table_2`"
            " JOIN `{project_id}.dataset_3.table_3`"
            " USING (col)"
        ),
    )

    walker = BigQueryViewDagWalker([base_view, second_view, third_view, leaf_view])
    leaf_node = walker.node_for_view(leaf_view)
    walker.populate_node_family_for_node(
        node=leaf_node, view_source_table_datasets={"source_dataset"}
    )

    # The source table is not a view, so its key is built from a raw address.
    source_table_key = DagKey(
        view_address=BigQueryAddress(
            dataset_id="source_dataset", table_id="source_table"
        )
    )
    ancestor_view_keys = {
        DagKey.for_view(ancestor)
        for ancestor in (base_view, second_view, third_view)
    }
    expected_parentage = {source_table_key} | ancestor_view_keys

    self.assertEqual(expected_parentage, leaf_node.node_family.full_parentage)
def test_dag_returns_parent_results(self) -> None:
    """Checks that process_dag hands each view the results of its parents.

    Each view's result is one more than the largest result among its parents
    (or 1 when it receives no parent results). The view with the largest
    result is then expected to have no children.
    """
    dag_walker = BigQueryViewDagWalker(self.all_views)

    def depth_from_parents(
        _view: BigQueryView, parent_results: Dict[BigQueryView, int]
    ) -> int:
        return 1 if not parent_results else max(parent_results.values()) + 1

    depth_by_view = dag_walker.process_dag(depth_from_parents)
    self.assertEqual(len(self.all_views), len(depth_by_view))

    # Track the first view that reaches the strictly greatest depth.
    deepest_view = None
    deepest = 0
    for candidate, candidate_depth in depth_by_view.items():
        if candidate_depth <= deepest:
            continue
        deepest, deepest_view = candidate_depth, candidate

    if not deepest_view:
        self.fail("Found no max_depth_view")

    deepest_node = dag_walker.node_for_view(deepest_view)
    self.assertEqual(set(), deepest_node.child_keys)
def test_populate_node_family_descendants_dfs_tree_str(self) -> None:
    """Checks child_dfs_tree_str for a top-level view and a mid-tree view.

    Six views are built: table_1 and table_2 read from source tables, table_3
    joins table_1 and table_2, table_4 and table_5 read from table_3, and
    table_6 reads from table_5. The descendants tree string is checked first
    from table_2 and then from table_3.
    """
    view_1 = BigQueryView(
        dataset_id="dataset_1",
        view_id="table_1",
        description="table_1 description",
        view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table`",
    )
    view_2 = BigQueryView(
        dataset_id="dataset_2",
        view_id="table_2",
        description="table_2 description",
        view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table_2`",
    )
    view_3 = BigQueryView(
        dataset_id="dataset_3",
        view_id="table_3",
        description="table_3 description",
        view_query_template=(
            " SELECT * FROM `{project_id}.dataset_1.table_1`"
            " JOIN `{project_id}.dataset_2.table_2`"
            " USING (col)"
        ),
    )
    view_4 = BigQueryView(
        dataset_id="dataset_4",
        view_id="table_4",
        description="table_4 description",
        view_query_template=" SELECT * FROM `{project_id}.dataset_3.table_3`",
    )
    view_5 = BigQueryView(
        dataset_id="dataset_5",
        view_id="table_5",
        description="table_5 description",
        view_query_template=" SELECT * FROM `{project_id}.dataset_3.table_3`",
    )
    view_6 = BigQueryView(
        dataset_id="dataset_6",
        view_id="table_6",
        description="table_6 description",
        view_query_template=" SELECT * FROM `{project_id}.dataset_5.table_5`",
    )
    dag_walker = BigQueryViewDagWalker(
        [view_1, view_2, view_3, view_4, view_5, view_6]
    )

    # NOTE(review): the expected trees below are written one node per line,
    # each line newline-terminated, with depth shown by the `|--` prefix.
    # Line breaks are spelled out as "\n" so they cannot be lost to
    # reformatting — confirm the separator against the tree renderer.

    # Top level view
    node = dag_walker.node_for_view(view_2)
    dag_walker.populate_node_family_for_node(node=node)
    expected_tree = (
        "dataset_2.table_2\n"
        "|--dataset_3.table_3\n"
        "|----dataset_4.table_4\n"
        "|----dataset_5.table_5\n"
        "|------dataset_6.table_6\n"
    )
    self.assertEqual(expected_tree, node.node_family.child_dfs_tree_str)

    # Descendants from middle of tree
    node = dag_walker.node_for_view(view_3)
    dag_walker.populate_node_family_for_node(node=node)
    expected_tree = (
        "dataset_3.table_3\n"
        "|--dataset_4.table_4\n"
        "|--dataset_5.table_5\n"
        "|----dataset_6.table_6\n"
    )
    self.assertEqual(expected_tree, node.node_family.child_dfs_tree_str)
def test_populate_node_family_parentage_dfs_tree_str(self) -> None:
    """Checks parent_dfs_tree_str under several population options.

    Five views are built: table_1 and table_2 read from source tables,
    table_3 joins table_1 and table_2, and table_4 and table_5 read from
    table_3. The parentage tree string is checked from table_5, from
    table_3, from table_3 with `datasets_to_skip={"dataset_1"}`, and from
    table_3 with a custom node formatter.
    """
    view_1 = BigQueryView(
        dataset_id="dataset_1",
        view_id="table_1",
        description="table_1 description",
        view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table`",
    )
    view_2 = BigQueryView(
        dataset_id="dataset_2",
        view_id="table_2",
        description="table_2 description",
        view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table_2`",
    )
    view_3 = BigQueryView(
        dataset_id="dataset_3",
        view_id="table_3",
        description="table_3 description",
        view_query_template=(
            " SELECT * FROM `{project_id}.dataset_1.table_1`"
            " JOIN `{project_id}.dataset_2.table_2`"
            " USING (col)"
        ),
    )
    view_4 = BigQueryView(
        dataset_id="dataset_4",
        view_id="table_4",
        description="table_4 description",
        view_query_template=" SELECT * FROM `{project_id}.dataset_3.table_3`",
    )
    view_5 = BigQueryView(
        dataset_id="dataset_5",
        view_id="table_5",
        description="table_5 description",
        view_query_template=" SELECT * FROM `{project_id}.dataset_3.table_3`",
    )
    dag_walker = BigQueryViewDagWalker([view_1, view_2, view_3, view_4, view_5])

    # NOTE(review): the expected trees below are written one node per line,
    # each line newline-terminated, with depth shown by the `|--` prefix.
    # Line breaks are spelled out as "\n" so they cannot be lost to
    # reformatting — confirm the separator against the tree renderer.

    # Top level view
    node = dag_walker.node_for_view(view_5)
    dag_walker.populate_node_family_for_node(node=node)
    expected_tree = (
        "dataset_5.table_5\n"
        "|--dataset_3.table_3\n"
        "|----dataset_2.table_2\n"
        "|------source_dataset.source_table_2\n"
        "|----dataset_1.table_1\n"
        "|------source_dataset.source_table\n"
    )
    self.assertEqual(expected_tree, node.node_family.parent_dfs_tree_str)

    # Middle of tree
    node = dag_walker.node_for_view(view_3)
    dag_walker.populate_node_family_for_node(node=node)
    expected_tree = (
        "dataset_3.table_3\n"
        "|--dataset_2.table_2\n"
        "|----source_dataset.source_table_2\n"
        "|--dataset_1.table_1\n"
        "|----source_dataset.source_table\n"
    )
    self.assertEqual(expected_tree, node.node_family.parent_dfs_tree_str)

    # Skip datasets: the dataset_1 line is expected to drop out, leaving its
    # source table one level up.
    dag_walker.populate_node_family_for_node(
        node=node,
        datasets_to_skip={"dataset_1"},
    )
    expected_tree = (
        "dataset_3.table_3\n"
        "|--dataset_2.table_2\n"
        "|----source_dataset.source_table_2\n"
        "|--source_dataset.source_table\n"
    )
    self.assertEqual(expected_tree, node.node_family.parent_dfs_tree_str)

    # Custom formatted
    def _custom_formatter(dag_key: DagKey) -> str:
        return f"custom_formatted_{dag_key.dataset_id}_{dag_key.table_id}"

    dag_walker.populate_node_family_for_node(
        node=node, custom_node_formatter=_custom_formatter
    )
    expected_tree = (
        "custom_formatted_dataset_3_table_3\n"
        "|--custom_formatted_dataset_2_table_2\n"
        "|----custom_formatted_source_dataset_source_table_2\n"
        "|--custom_formatted_dataset_1_table_1\n"
        "|----custom_formatted_source_dataset_source_table\n"
    )
    self.assertEqual(expected_tree, node.node_family.parent_dfs_tree_str)
def test_populate_node_family_full_parentage(self) -> None:
    """Checks full_parentage from several starting points in the x-shaped DAG.

    Uses the views in `self.x_shaped_dag_views_list` and populates the node
    family starting from index 0, index 2, index 3, and finally from indexes
    3 and 4 together, comparing the resulting parentage to the expected keys.
    """
    views = self.x_shaped_dag_views_list
    dag_walker = BigQueryViewDagWalker(views)

    source_table_key = DagKey(
        view_address=BigQueryAddress(
            dataset_id="source_dataset", table_id="source_table"
        )
    )
    source_table_2_key = DagKey(
        view_address=BigQueryAddress(
            dataset_id="source_dataset", table_id="source_table_2"
        )
    )

    # root node start
    root_node = dag_walker.node_for_view(views[0])
    dag_walker.populate_node_family_for_node(
        node=root_node, view_source_table_datasets={"source_dataset"}
    )
    self.assertEqual({source_table_key}, root_node.node_family.full_parentage)

    # start in middle
    middle_node = dag_walker.node_for_view(views[2])
    dag_walker.populate_node_family_for_node(
        node=middle_node, view_source_table_datasets={"source_dataset"}
    )
    expected_middle_parentage = {
        source_table_key,
        source_table_2_key,
        DagKey.for_view(views[0]),
        DagKey.for_view(views[1]),
    }
    self.assertEqual(
        expected_middle_parentage, middle_node.node_family.full_parentage
    )

    # single start node
    lower_node = dag_walker.node_for_view(views[3])
    dag_walker.populate_node_family_for_node(
        node=lower_node, view_source_table_datasets={"source_dataset"}
    )
    expected_lower_parentage = expected_middle_parentage | {
        DagKey.for_view(views[2])
    }
    self.assertEqual(
        expected_lower_parentage, lower_node.node_family.full_parentage
    )

    # multiple start nodes: the combined parentage is compared against the
    # same expected set as the single start node above.
    combined_parentage: Set[DagKey] = set()
    for node in (lower_node, dag_walker.node_for_view(views[4])):
        dag_walker.populate_node_family_for_node(
            node=node, view_source_table_datasets={"source_dataset"}
        )
        combined_parentage.update(node.node_family.full_parentage)
    self.assertEqual(expected_lower_parentage, combined_parentage)