def test_parse_view_multiple_parents(self) -> None:
     """A view whose query joins two tables reports both tables as parent keys."""
     joined_view = BigQueryView(
         dataset_id="my_dataset",
         view_id="my_view_id",
         description="my view description",
         view_query_template="""SELECT * FROM `{project_id}.some_dataset.some_table`
         LEFT OUTER JOIN `{project_id}.some_dataset.other_table`
         USING (some_col);
         """,
     )
     dag_node = BigQueryViewDagNode(joined_view)
     # No materialized addresses are registered for this node.
     dag_node.set_materialized_addresss({})

     # One expected key per table referenced in the query template above.
     expected_parent_keys = {
         DagKey(
             view_address=BigQueryAddress(
                 dataset_id="some_dataset", table_id=parent_table_id
             )
         )
         for parent_table_id in ("some_table", "other_table")
     }
     self.assertEqual(dag_node.parent_keys, expected_parent_keys)
 def test_parse_view_materialized_parent(self) -> None:
     """A query against a parent's materialized table resolves to the parent view.

     The child view selects from `some_dataset.some_table_materialized`. Once the
     node is told which DagKey the parent's materialized address maps to, its
     parent keys must contain the key with address `some_dataset.some_table`.
     """
     view = BigQueryView(
         dataset_id="my_dataset",
         view_id="my_view_id",
         description="my view description",
         view_query_template="SELECT * FROM `{project_id}.some_dataset.some_table_materialized`",
     )
     parent_view = BigQueryView(
         dataset_id="some_dataset",
         view_id="some_table",
         description="my parent view description",
         view_query_template="SELECT * FROM UNNEST([])",
         should_materialize=True,
     )
     node = BigQueryViewDagNode(view)
     # Guard so the address can safely be used as a dictionary key below.
     if not parent_view.materialized_address:
         # The f-prefix is required so the offending view appears in the message.
         raise ValueError(f"Null materialized_address for view [{parent_view}]")
     node.set_materialized_addresss(
         {parent_view.materialized_address: DagKey.for_view(parent_view)}
     )
     self.assertEqual(
         node.parent_keys,
         {
             DagKey(
                 view_address=BigQueryAddress(
                     dataset_id="some_dataset", table_id="some_table"
                 )
             )
         },
     )
    def test_populate_node_family_full_parentage_complex_dependencies(self) -> None:
        """Full parentage of the last view in a chain with overlapping dependencies.

        The first view reads the source table, the second reads the first, the
        third joins the first and second, and the fourth joins the second and
        third. Starting from the fourth view, the full parentage is expected to
        hold the source table plus the first three views.
        """
        first_view = BigQueryView(
            dataset_id="dataset_1",
            view_id="table_1",
            description="table_1 description",
            view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table`",
        )
        second_view = BigQueryView(
            dataset_id="dataset_2",
            view_id="table_2",
            description="table_2 description",
            view_query_template="SELECT * FROM `{project_id}.dataset_1.table_1`",
        )
        third_view = BigQueryView(
            dataset_id="dataset_3",
            view_id="table_3",
            description="table_3 description",
            view_query_template="""
                           SELECT * FROM `{project_id}.dataset_1.table_1`
                           JOIN `{project_id}.dataset_2.table_2`
                           USING (col)""",
        )
        fourth_view = BigQueryView(
            dataset_id="dataset_4",
            view_id="table_4",
            description="table_4 description",
            view_query_template="""
                           SELECT * FROM `{project_id}.dataset_2.table_2`
                           JOIN `{project_id}.dataset_3.table_3`
                           USING (col)""",
        )

        walker = BigQueryViewDagWalker(
            [first_view, second_view, third_view, fourth_view]
        )
        last_node = walker.node_for_view(fourth_view)
        walker.populate_node_family_for_node(
            node=last_node, view_source_table_datasets={"source_dataset"}
        )

        source_table_key = DagKey(
            view_address=BigQueryAddress(
                dataset_id="source_dataset", table_id="source_table"
            )
        )
        ancestor_view_keys = {
            DagKey.for_view(ancestor)
            for ancestor in (first_view, second_view, third_view)
        }
        expected_parentage = {source_table_key} | ancestor_view_keys
        self.assertEqual(expected_parentage, last_node.node_family.full_parentage)
Exemple #4
0
def print_dfs_tree(dataset_id: str,
                   view_id: str,
                   print_downstream_tree: bool = False) -> None:
    """Print the DFS tree string for one view's dependency family.

    Args:
        dataset_id: Dataset id of the view to look up.
        view_id: Table id of the view to look up.
        print_downstream_tree: When True, print the node family's
            `child_dfs_tree_str`; otherwise print its `parent_dfs_tree_str`.
    """
    walker = build_dag_walker(dataset_id, view_id)
    target_address = BigQueryAddress(dataset_id=dataset_id, table_id=view_id)
    target_node = walker.nodes_by_key[DagKey(view_address=target_address)]

    # The tree strings are read from the node family after it is populated.
    walker.populate_node_family_for_node(node=target_node)

    family = target_node.node_family
    if print_downstream_tree:
        tree_str = family.child_dfs_tree_str
    else:
        tree_str = family.parent_dfs_tree_str
    print(tree_str)
 def _get_all_views_to_document(self) -> Set[DagKey]:
     """Collect the DagKey of every DAG walker view that should be documented.

     Nodes whose dataset id appears in DATASETS_TO_SKIP_VIEW_DOCUMENTATION are
     left out of the result.
     """
     keys_to_document: Set[DagKey] = set()
     for dag_node in self.dag_walker.nodes_by_key.values():
         if dag_node.dag_key.dataset_id in DATASETS_TO_SKIP_VIEW_DOCUMENTATION:
             continue
         keys_to_document.add(DagKey(view_address=dag_node.view.address))
     return keys_to_document
    def test_parse_simple_view(self) -> None:
        """Checks the keys and flags of a node built from a single-table view.

        Verifies the initial root flag, own key, parent keys and child keys,
        then checks that setting `is_root` and adding a child key are reflected
        on the node.
        """
        simple_view = BigQueryView(
            dataset_id="my_dataset",
            view_id="my_view_id",
            description="my view description",
            view_query_template="SELECT * FROM `{project_id}.some_dataset.some_table`",
        )
        dag_node = BigQueryViewDagNode(simple_view)
        self.assertIsNone(simple_view.materialized_address)
        dag_node.set_materialized_addresss({})

        expected_own_key = DagKey(
            view_address=BigQueryAddress(
                dataset_id="my_dataset", table_id="my_view_id"
            )
        )
        expected_parent_key = DagKey(
            view_address=BigQueryAddress(
                dataset_id="some_dataset", table_id="some_table"
            )
        )

        # State of a freshly built node.
        self.assertEqual(dag_node.is_root, False)
        self.assertEqual(dag_node.dag_key, expected_own_key)
        self.assertEqual(dag_node.parent_keys, {expected_parent_key})
        self.assertEqual(dag_node.child_keys, set())

        # Mutate the node, then confirm the changes are visible.
        dag_node.is_root = True
        added_child_key = DagKey(
            view_address=BigQueryAddress(
                dataset_id="other_dataset", table_id="other_table"
            )
        )
        dag_node.add_child_key(added_child_key)

        self.assertEqual(dag_node.is_root, True)
        self.assertEqual(dag_node.child_keys, {added_child_key})
        def _preprocess_views(
                v: BigQueryView, _parent_results: Dict[BigQueryView,
                                                       None]) -> None:
            """Populate the node family of the DAG node that belongs to view `v`.

            `_parent_results` is accepted but not used.
            """
            view_node = self.dag_walker.nodes_by_key[DagKey(view_address=v.address)]

            skipped_datasets = (
                {DATAFLOW_METRICS_MATERIALIZED_DATASET} | RAW_TABLE_DATASETS
            )
            source_table_datasets = VIEW_SOURCE_TABLE_DATASETS | LATEST_VIEW_DATASETS

            # Populating the family fills in the full child/parent dependencies
            # and the tree representations that various sections rely on.
            self.dag_walker.populate_node_family_for_node(
                node=view_node,
                datasets_to_skip=skipped_datasets,
                custom_node_formatter=self._dependency_tree_formatter_for_gitbook,
                view_source_table_datasets=source_table_datasets,
            )
 def process_check_using_materialized(
     view: BigQueryView, _parent_results: Dict[BigQueryView, None]
 ) -> None:
     """Assert that `view` never reads an un-materialized parent needlessly.

     For every parent table of the view that is itself a view in the walker,
     the parent must have no materialized address; otherwise the view should
     have been querying the materialized table instead.
     """
     node = walker.node_for_view(view)

     # Parents already referenced through their materialized table are fine.
     non_materialized_addresses = (
         address
         for address in node.parent_tables
         if address not in walker.materialized_addresss
     )
     for address in non_materialized_addresses:
         parent_key = DagKey(view_address=address)
         if parent_key not in walker.nodes_by_key:
             # We assume this is a source data table (checked in other tests)
             continue
         parent_view: BigQueryView = walker.view_for_key(parent_key)
         failure_message = (
             f"Found view [{node.dag_key}] referencing un-materialized version "
             f"of view [{parent_key}] when materialized table "
             f"[{parent_view.materialized_address}] exists."
         )
         self.assertIsNone(parent_view.materialized_address, failure_message)
    def _get_all_parent_keys_for_product(
            self, product: ProductConfig) -> Set[DagKey]:
        """Returns a set containing a DagKey for every view that this product relies upon.

        The result holds each view configured for the product together with the
        full parentage of that view's node, minus any key whose dataset is
        DATAFLOW_METRICS_MATERIALIZED_DATASET.
        """
        collected_keys: Set[DagKey] = set()
        for address in self._get_all_config_view_addresses_for_product(product):
            top_level_key = DagKey(view_address=address)
            top_level_node = self.dag_walker.nodes_by_key[top_level_key]
            # The configured view itself counts, and so do all of its ancestors.
            collected_keys.add(top_level_key)
            collected_keys.update(top_level_node.node_family.full_parentage)

        # Ignore materialized metric views as relevant metric info can be found in a
        # different dataset (DATAFLOW_METRICS_DATASET).
        materialized_metric_keys = {
            key
            for key in collected_keys
            if key.dataset_id == DATAFLOW_METRICS_MATERIALIZED_DATASET
        }
        return collected_keys - materialized_metric_keys
    def test_populate_node_family_full_parentage(self) -> None:
        """Checks `full_parentage` for several start nodes of the X-shaped DAG.

        Covers starting at the first view, at the third view, at the fourth
        view, and finally the union of parentages for the fourth and fifth
        views together.
        """
        views = self.x_shaped_dag_views_list
        walker = BigQueryViewDagWalker(views)

        def _source_key(table_id: str) -> DagKey:
            # Key for a table that lives in the source dataset.
            return DagKey(
                view_address=BigQueryAddress(
                    dataset_id="source_dataset", table_id=table_id
                )
            )

        # root node start
        current_node = walker.node_for_view(views[0])
        walker.populate_node_family_for_node(
            node=current_node, view_source_table_datasets={"source_dataset"}
        )
        self.assertEqual(
            {_source_key("source_table")},
            current_node.node_family.full_parentage,
        )

        # start in middle
        current_node = walker.node_for_view(views[2])
        walker.populate_node_family_for_node(
            node=current_node, view_source_table_datasets={"source_dataset"}
        )
        expected_from_middle = {
            _source_key("source_table"),
            _source_key("source_table_2"),
            DagKey.for_view(views[0]),
            DagKey.for_view(views[1]),
        }
        self.assertEqual(
            expected_from_middle,
            current_node.node_family.full_parentage,
        )

        # single start node
        current_node = walker.node_for_view(views[3])
        walker.populate_node_family_for_node(
            node=current_node, view_source_table_datasets={"source_dataset"}
        )
        expected_from_single = expected_from_middle | {DagKey.for_view(views[2])}
        self.assertEqual(
            expected_from_single,
            current_node.node_family.full_parentage,
        )

        # multiple start nodes
        combined_parentage: Set[DagKey] = set()
        for start in (current_node, walker.node_for_view(views[4])):
            walker.populate_node_family_for_node(
                node=start, view_source_table_datasets={"source_dataset"}
            )
            combined_parentage |= start.node_family.full_parentage

        # The combined parentage is expected to match the single-start case.
        self.assertEqual(expected_from_single, combined_parentage)
 def process_simple(
     view: BigQueryView, _parent_results: Dict[BigQueryView, DagKey]
 ) -> DagKey:
     """Sleep for the mock processing time, then return the DagKey of `view`.

     `_parent_results` is accepted but not used.
     """
     time.sleep(MOCK_VIEW_PROCESS_TIME_SECONDS)
     view_key = DagKey.for_view(view)
     return view_key