def test_parse_view_multiple_parents(self) -> None:
    """Checks that each table referenced in a joined query becomes a parent key."""
    joined_view = BigQueryView(
        dataset_id="my_dataset",
        view_id="my_view_id",
        description="my view description",
        view_query_template="""SELECT * FROM `{project_id}.some_dataset.some_table` LEFT OUTER JOIN `{project_id}.some_dataset.other_table` USING (some_col); """,
    )
    dag_node = BigQueryViewDagNode(joined_view)
    # An empty mapping is supplied: this test registers no materialized
    # addresses for the node.
    dag_node.set_materialized_addresss({})

    actual_parent_keys = dag_node.parent_keys
    # Both tables named in the query live in `some_dataset`.
    expected_parent_keys = {
        DagKey(
            view_address=BigQueryAddress(
                dataset_id="some_dataset", table_id=parent_table_id
            )
        )
        for parent_table_id in ("some_table", "other_table")
    }
    self.assertEqual(actual_parent_keys, expected_parent_keys)
def test_parse_view_materialized_parent(self) -> None:
    """Checks that a query against a parent's materialized table resolves to the
    parent view's own key.

    The child view selects from `some_dataset.some_table_materialized`; once the
    node is given a mapping from the parent's materialized address to the
    parent's DagKey, its parent keys are expected to contain the key for
    `some_dataset.some_table`.
    """
    view = BigQueryView(
        dataset_id="my_dataset",
        view_id="my_view_id",
        description="my view description",
        view_query_template="SELECT * FROM `{project_id}.some_dataset.some_table_materialized`",
    )
    parent_view = BigQueryView(
        dataset_id="some_dataset",
        view_id="some_table",
        description="my parent view description",
        view_query_template="SELECT * FROM UNNEST([])",
        should_materialize=True,
    )
    node = BigQueryViewDagNode(view)

    # Guard so the materialized address can be used as a dict key below. The
    # message must be an f-string for the view to appear in the error text.
    if not parent_view.materialized_address:
        raise ValueError(f"Null materialized_address for view [{parent_view}]")

    node.set_materialized_addresss(
        {parent_view.materialized_address: DagKey.for_view(parent_view)}
    )
    self.assertEqual(
        node.parent_keys,
        {
            DagKey(
                view_address=BigQueryAddress(
                    dataset_id="some_dataset", table_id="some_table"
                )
            )
        },
    )
def test_populate_node_family_full_parentage_complex_dependencies(self) -> None:
    """Checks full parentage for a view whose ancestors reference each other.

    The queries chain as follows: table_1 reads the source table, table_2 reads
    table_1, table_3 joins table_1 and table_2, and table_4 joins table_2 and
    table_3. Starting from table_4, the full parentage is expected to hold the
    source table plus the keys for table_1 through table_3.
    """
    first_view = BigQueryView(
        dataset_id="dataset_1",
        view_id="table_1",
        description="table_1 description",
        view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table`",
    )
    second_view = BigQueryView(
        dataset_id="dataset_2",
        view_id="table_2",
        description="table_2 description",
        view_query_template="SELECT * FROM `{project_id}.dataset_1.table_1`",
    )
    third_view = BigQueryView(
        dataset_id="dataset_3",
        view_id="table_3",
        description="table_3 description",
        view_query_template=""" SELECT * FROM `{project_id}.dataset_1.table_1` JOIN `{project_id}.dataset_2.table_2` USING (col)""",
    )
    fourth_view = BigQueryView(
        dataset_id="dataset_4",
        view_id="table_4",
        description="table_4 description",
        view_query_template=""" SELECT * FROM `{project_id}.dataset_2.table_2` JOIN `{project_id}.dataset_3.table_3` USING (col)""",
    )

    walker = BigQueryViewDagWalker([first_view, second_view, third_view, fourth_view])
    leaf_node = walker.node_for_view(fourth_view)
    walker.populate_node_family_for_node(
        node=leaf_node, view_source_table_datasets={"source_dataset"}
    )

    source_table_key = DagKey(
        view_address=BigQueryAddress(
            dataset_id="source_dataset", table_id="source_table"
        )
    )
    expected_parentage = {source_table_key}
    for ancestor_view in (first_view, second_view, third_view):
        expected_parentage.add(DagKey.for_view(ancestor_view))

    self.assertEqual(expected_parentage, leaf_node.node_family.full_parentage)
def print_dfs_tree(
    dataset_id: str, view_id: str, print_downstream_tree: bool = False
) -> None:
    """Print a DFS dependency tree string for a single view.

    Args:
        dataset_id: Dataset of the view to inspect.
        view_id: Id of the view to inspect.
        print_downstream_tree: When true, print the node family's child tree
            string; otherwise print its parent tree string.
    """
    walker = build_dag_walker(dataset_id, view_id)
    target_key = DagKey(
        view_address=BigQueryAddress(dataset_id=dataset_id, table_id=view_id)
    )
    # Direct subscript: a view that is absent from the walker is not handled
    # here and surfaces as whatever error the lookup raises.
    target_node = walker.nodes_by_key[target_key]
    walker.populate_node_family_for_node(node=target_node)

    family = target_node.node_family
    if print_downstream_tree:
        tree_str = family.child_dfs_tree_str
    else:
        tree_str = family.parent_dfs_tree_str
    print(tree_str)
def _get_all_views_to_document(self) -> Set[DagKey]:
    """Collect a DagKey for every DAG walker view that should be documented.

    Nodes whose dataset appears in DATASETS_TO_SKIP_VIEW_DOCUMENTATION are
    left out.
    """
    keys_to_document: Set[DagKey] = set()
    for dag_node in self.dag_walker.nodes_by_key.values():
        if dag_node.dag_key.dataset_id in DATASETS_TO_SKIP_VIEW_DOCUMENTATION:
            continue
        keys_to_document.add(DagKey(view_address=dag_node.view.address))
    return keys_to_document
def test_parse_simple_view(self) -> None:
    """Checks the initial state of a node built from a single-table view, then
    checks that setting `is_root` and adding a child key are reflected on it.
    """
    simple_view = BigQueryView(
        dataset_id="my_dataset",
        view_id="my_view_id",
        description="my view description",
        view_query_template="SELECT * FROM `{project_id}.some_dataset.some_table`",
    )
    dag_node = BigQueryViewDagNode(simple_view)
    self.assertIsNone(simple_view.materialized_address)
    dag_node.set_materialized_addresss({})

    # Initial state: not a root, keyed by its own address, one parent, no children.
    self.assertEqual(dag_node.is_root, False)

    expected_own_key = DagKey(
        view_address=BigQueryAddress(dataset_id="my_dataset", table_id="my_view_id")
    )
    self.assertEqual(dag_node.dag_key, expected_own_key)

    expected_parent_key = DagKey(
        view_address=BigQueryAddress(dataset_id="some_dataset", table_id="some_table")
    )
    self.assertEqual(dag_node.parent_keys, {expected_parent_key})
    self.assertEqual(dag_node.child_keys, set())

    # Mutate the node and confirm both changes are visible.
    dag_node.is_root = True
    added_child_key = DagKey(
        view_address=BigQueryAddress(
            dataset_id="other_dataset", table_id="other_table"
        )
    )
    dag_node.add_child_key(added_child_key)

    self.assertEqual(dag_node.is_root, True)
    self.assertEqual(dag_node.child_keys, {added_child_key})
def _preprocess_views(
    v: BigQueryView, _parent_results: Dict[BigQueryView, None]
) -> None:
    """Populate the node family for the DAG node that corresponds to view `v`.

    `self` is not a parameter of this function; it is resolved from the
    enclosing scope. `_parent_results` is accepted but unused.
    """
    view_node = self.dag_walker.nodes_by_key[DagKey(view_address=v.address)]

    skipped_datasets = {DATAFLOW_METRICS_MATERIALIZED_DATASET} | RAW_TABLE_DATASETS
    source_table_datasets = VIEW_SOURCE_TABLE_DATASETS | LATEST_VIEW_DATASETS

    # Builds the complete parent/child dependency sets and their tree
    # representations, which various sections rely on.
    self.dag_walker.populate_node_family_for_node(
        node=view_node,
        datasets_to_skip=skipped_datasets,
        custom_node_formatter=self._dependency_tree_formatter_for_gitbook,
        view_source_table_datasets=source_table_datasets,
    )
def process_check_using_materialized(
    view: BigQueryView, _parent_results: Dict[BigQueryView, None]
) -> None:
    """Assert that `view` never references the un-materialized form of a parent
    view that has a materialized table.

    `walker` and `self` are resolved from the enclosing scope;
    `_parent_results` is accepted but unused.
    """
    node = walker.node_for_view(view)
    for table_address in node.parent_tables:
        # Addresses already known to be materialized tables need no check.
        if table_address not in walker.materialized_addresss:
            parent_key = DagKey(view_address=table_address)
            # Keys absent from the walker are assumed to be source data
            # tables (checked in other tests).
            if parent_key in walker.nodes_by_key:
                parent_view: BigQueryView = walker.view_for_key(parent_key)
                self.assertIsNone(
                    parent_view.materialized_address,
                    f"Found view [{node.dag_key}] referencing un-materialized version "
                    f"of view [{parent_key}] when materialized table "
                    f"[{parent_view.materialized_address}] exists.",
                )
def _get_all_parent_keys_for_product(
    self, product: ProductConfig
) -> Set[DagKey]:
    """Returns a set containing a DagKey for every view that this product relies
    upon.

    Args:
        product: The product config whose view addresses seed the search.

    Returns:
        The keys of the product's configured views plus every key in each of
        those views' full parentage, excluding keys in
        DATAFLOW_METRICS_MATERIALIZED_DATASET.

    NOTE(review): reads `node.node_family.full_parentage`, so this presumably
    requires the node families to have been populated beforehand — confirm
    against the caller.
    """
    all_parent_keys: Set[DagKey] = set()
    for view_address in self._get_all_config_view_addresses_for_product(product):
        dag_key = DagKey(view_address=view_address)
        node = self.dag_walker.nodes_by_key[dag_key]

        # Add in the top level view
        all_parent_keys.add(dag_key)

        # Add in all ancestors. Updating in place avoids copying the whole
        # accumulated set on every iteration.
        all_parent_keys.update(node.node_family.full_parentage)

    # Ignore materialized metric views as relevant metric info can be found in a
    # different dataset (DATAFLOW_METRICS_DATASET).
    return {
        key
        for key in all_parent_keys
        if key.dataset_id != DATAFLOW_METRICS_MATERIALIZED_DATASET
    }
def test_populate_node_family_full_parentage(self) -> None:
    """Checks `full_parentage` for several starting points in the DAG built
    from `x_shaped_dag_views_list`.
    """
    views = self.x_shaped_dag_views_list
    walker = BigQueryViewDagWalker(views)

    def source_key(table_id: str) -> DagKey:
        # Key for a table in the dataset passed as a view source table dataset.
        return DagKey(
            view_address=BigQueryAddress(
                dataset_id="source_dataset", table_id=table_id
            )
        )

    # root node start
    current_node = walker.node_for_view(views[0])
    walker.populate_node_family_for_node(
        node=current_node, view_source_table_datasets={"source_dataset"}
    )
    self.assertEqual(
        {source_key("source_table")},
        current_node.node_family.full_parentage,
    )

    # start in middle
    current_node = walker.node_for_view(views[2])
    walker.populate_node_family_for_node(
        node=current_node, view_source_table_datasets={"source_dataset"}
    )
    expected_parentage = {
        source_key("source_table"),
        source_key("source_table_2"),
        DagKey.for_view(views[0]),
        DagKey.for_view(views[1]),
    }
    self.assertEqual(expected_parentage, current_node.node_family.full_parentage)

    # single start node
    current_node = walker.node_for_view(views[3])
    walker.populate_node_family_for_node(
        node=current_node, view_source_table_datasets={"source_dataset"}
    )
    expected_parentage = {
        source_key("source_table"),
        source_key("source_table_2"),
        DagKey.for_view(views[0]),
        DagKey.for_view(views[1]),
        DagKey.for_view(views[2]),
    }
    self.assertEqual(expected_parentage, current_node.node_family.full_parentage)

    # multiple start nodes: the combined parentage of the previous node and the
    # node for views[4] is compared against the same expected set as above.
    combined_parentage: Set[DagKey] = set()
    for family_node in (current_node, walker.node_for_view(views[4])):
        walker.populate_node_family_for_node(
            node=family_node, view_source_table_datasets={"source_dataset"}
        )
        combined_parentage.update(family_node.node_family.full_parentage)
    self.assertEqual(expected_parentage, combined_parentage)
def process_simple(
    view: BigQueryView, _parent_results: Dict[BigQueryView, DagKey]
) -> DagKey:
    """Sleep for MOCK_VIEW_PROCESS_TIME_SECONDS, then return the DagKey for `view`.

    `_parent_results` is accepted but unused.
    """
    time.sleep(MOCK_VIEW_PROCESS_TIME_SECONDS)
    view_key = DagKey.for_view(view)
    return view_key