# Example #1
# 0
def _create_dataset_and_deploy_views(
    views_to_update: List[BigQueryView], set_temp_dataset_table_expiration: bool = False
) -> None:
    """Create and update the given views and their parent datasets.

    Ensures the dataset for every view exists, then walks the view DAG and
    creates or updates each view. If a view has a set materialized_view_table_id
    field, the walker's process function materializes the view into a table.

    Args:
        views_to_update: A list of view objects to be created or updated.
        set_temp_dataset_table_expiration: If True, new datasets will be created
            with an expiration of TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS.
    """
    client = BigQueryClientImpl()
    _create_all_datasets_if_necessary(
        client, views_to_update, set_temp_dataset_table_expiration
    )

    walker = BigQueryViewDagWalker(views_to_update)

    def _process(view: BigQueryView, parent_results: Dict[BigQueryView, bool]) -> bool:
        """Returns True if this view or any of its parents were updated."""
        return _create_or_update_view_and_materialize_if_necessary(
            client, view, parent_results
        )

    walker.process_dag(_process)
    def test_dag_exception_handling(self) -> None:
        """Test that exceptions during processing propagate properly."""

        class TestDagWalkException(ValueError):
            pass

        walker = BigQueryViewDagWalker(self.all_views)

        # A process function that fails immediately should surface its
        # exception from process_dag.
        def always_raises(
            _view: BigQueryView, _parent_results: Dict[BigQueryView, None]
        ) -> None:
            raise TestDagWalkException()

        with self.assertRaises(TestDagWalkException):
            _ = walker.process_dag(always_raises)

        # A process function that fails only once past the roots should also
        # propagate, even though some nodes processed successfully.
        def raises_after_roots(
            view: BigQueryView, _parent_results: Dict[BigQueryView, DagKey]
        ) -> DagKey:
            key = (view.dataset_id, view.view_id)
            if not walker.nodes_by_key[key].is_root:
                raise TestDagWalkException()
            return key

        with self.assertRaises(TestDagWalkException):
            _ = walker.process_dag(raises_after_roots)
    def test_get_sub_dag_leaf_node(self) -> None:
        """Sub-dags built from a leaf node of the X-shaped DAG."""
        walker = BigQueryViewDagWalker(self.x_shaped_dag_views_list)
        leaf_view = self.x_shaped_dag_views_list[4]

        # A leaf's descendants sub-dag contains only the leaf itself.
        descendants = walker.get_descendants_sub_dag([leaf_view])
        self.assertCountEqual([leaf_view], descendants.views)

        # The ancestors sub-dag pulls in everything the leaf depends on.
        ancestors = walker.get_ancestors_sub_dag([leaf_view])
        expected = [
            view
            for view in self.x_shaped_dag_views_list
            # This view does not depend on other leaf view "table_4"
            if view.view_id != "table_4"
        ]
        self.assertCountEqual(expected, ancestors.views)
    def test_get_sub_dag_multiple_input_views2(self) -> None:
        """Sub-dags built from two input views of the X-shaped DAG."""
        walker = BigQueryViewDagWalker(self.x_shaped_dag_views_list)
        views = self.x_shaped_dag_views_list
        input_views = [views[2], views[4]]

        # Descendants of views 2 and 4 are exactly views 2, 3 and 4.
        descendants = walker.get_descendants_sub_dag(input_views)
        self.assertCountEqual([views[2], views[3], views[4]], descendants.views)

        # Ancestors of views 2 and 4 are views 0, 1, 2 and 4.
        ancestors = walker.get_ancestors_sub_dag(input_views)
        self.assertCountEqual(
            [views[0], views[1], views[2], views[4]], ancestors.views
        )
    def test_views_use_materialized_if_present(self) -> None:
        """Checks that each view is using the materialized version of a parent view, if
        one exists."""
        walker = BigQueryViewDagWalker(self.all_views)

        def check_materialized_parents(
            view: BigQueryView, _parent_results: Dict[BigQueryView, None]
        ) -> None:
            node = walker.node_for_view(view)
            for parent_address in node.parent_tables:
                if parent_address in walker.materialized_addresss:
                    # Already referencing a materialized table - nothing to check.
                    continue
                parent_key = DagKey(view_address=parent_address)
                if parent_key not in walker.nodes_by_key:
                    # Assumed to be a source data table (covered by other tests).
                    continue
                parent_view: BigQueryView = walker.view_for_key(parent_key)
                # The parent view must not have a materialized table we ignored.
                self.assertIsNone(
                    parent_view.materialized_address,
                    f"Found view [{node.dag_key}] referencing un-materialized version "
                    f"of view [{parent_key}] when materialized table "
                    f"[{parent_view.materialized_address}] exists.",
                )

        result = walker.process_dag(check_materialized_parents)
        self.assertEqual(len(self.all_views), len(result))
    def test_dag_returns_parent_results(self) -> None:
        """Each process call should see its parents' results; depths accumulate."""
        walker = BigQueryViewDagWalker(self.all_views)

        def compute_depth(
            _view: BigQueryView, parent_results: Dict[BigQueryView, int]
        ) -> int:
            # Roots sit at depth 1; every other node one level below its
            # deepest parent.
            return max(parent_results.values(), default=0) + 1

        result = walker.process_dag(compute_depth)
        self.assertEqual(len(self.all_views), len(result))

        deepest = 0
        deepest_view = None
        for view, depth in result.items():
            if depth > deepest:
                deepest = depth
                deepest_view = view
        if not deepest_view:
            self.fail("Found no max_depth_view")
        # The deepest node must be a leaf - nothing can hang below it.
        deepest_node = walker.nodes_by_key[
            (deepest_view.dataset_id, deepest_view.view_id)
        ]
        self.assertEqual(set(), deepest_node.child_keys)
    def test_dag_process_time(self) -> None:
        """Processing the DAG should be significantly faster than serial work."""
        num_views = len(self.all_views)
        walker = BigQueryViewDagWalker(self.all_views)

        # Time it would take to process every view one after another.
        serial_processing_time = datetime.timedelta(
            seconds=num_views * MOCK_VIEW_PROCESS_TIME_SECONDS
        )

        def process_simple(
            view: BigQueryView, _parent_results: Dict[BigQueryView, DagKey]
        ) -> DagKey:
            time.sleep(MOCK_VIEW_PROCESS_TIME_SECONDS)
            return view.dataset_id, view.table_id

        start = datetime.datetime.now()
        result = walker.process_dag(process_simple)
        elapsed = datetime.datetime.now() - start

        self.assertEqual(num_views, len(result))

        # We expect significant speedup (>5x) over fully-serial processing.
        self.assertLess(elapsed * 5, serial_processing_time)
    def test_union_dags_same_view_different_object(self) -> None:
        """Unioning two DAGs containing equal-but-distinct view objects should
        deduplicate down to a single view."""

        def make_view() -> BigQueryView:
            # Structurally-identical view, freshly constructed on each call.
            return BigQueryView(
                dataset_id="dataset_1",
                view_id="table_1",
                description="table_1 description",
                should_materialize=True,
                materialized_address_override=BigQueryAddress(
                    dataset_id="other_dataset_1", table_id="other_table_1"
                ),
                view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table`",
            )

        view = make_view()
        unioned_dag = BigQueryViewDagWalker.union_dags(
            BigQueryViewDagWalker([view]),
            BigQueryViewDagWalker([make_view()]),
        )

        self.assertCountEqual([view], unioned_dag.views)
    def test_sub_dag_with_cycle(self) -> None:
        """Sub-dags of the diamond-shaped DAG built from two input views."""
        walker = BigQueryViewDagWalker(self.diamond_shaped_dag_views_list)
        views = self.diamond_shaped_dag_views_list
        input_views = [views[1], views[4]]

        # Descendants sub-dag.
        sub_dag = walker.get_descendants_sub_dag(input_views)
        self.assertCountEqual(
            [views[1], views[2], views[3], views[4], views[5]], sub_dag.views
        )

        # Ancestors sub-dag.
        sub_dag = walker.get_ancestors_sub_dag(input_views)
        self.assertCountEqual(
            [views[0], views[1], views[2], views[4]], sub_dag.views
        )
# Example #10
# 0
def rematerialize_views_for_namespace(
    # TODO(#5785): Clarify use case of BigQueryViewNamespace filter (see ticket for more)
    bq_view_namespace: BigQueryViewNamespace,
    candidate_view_builders: Sequence[BigQueryViewBuilder],
    dataset_overrides: Optional[Dict[str, str]] = None,
    skip_missing_views: bool = False,
) -> None:
    """For all views in a given namespace, re-materializes any materialized views. This should be called only when we
    want to refresh the data in the materialized view, not when we want to update the underlying query of the view.

    Args:
        bq_view_namespace: Namespace used to tag the failure metric on error.
        candidate_view_builders: Builders for the views considered for re-materialization.
        dataset_overrides: Optional dataset-id override map; when non-empty, new
            datasets created here get a default table expiration.
        skip_missing_views: If True, materialized views that do not exist in
            BigQuery are skipped instead of raising.
    """
    set_default_table_expiration_for_new_datasets = bool(dataset_overrides)
    if set_default_table_expiration_for_new_datasets:
        logging.info(
            "Found non-empty dataset overrides. New datasets created in this process will have a "
            "default table expiration of 24 hours."
        )

    try:
        views_to_update = _build_views_to_update(
            candidate_view_builders=candidate_view_builders,
            dataset_overrides=dataset_overrides,
        )

        bq_client = BigQueryClientImpl()
        _create_all_datasets_if_necessary(
            bq_client, views_to_update, set_default_table_expiration_for_new_datasets
        )

        dag_walker = BigQueryViewDagWalker(views_to_update)

        def _materialize_view(
            v: BigQueryView, _parent_results: Dict[BigQueryView, None]
        ) -> None:
            if not v.materialized_view_table_id:
                logging.info(
                    "Skipping non-materialized view [%s.%s].", v.dataset_id, v.view_id
                )
                return

            if skip_missing_views and not bq_client.table_exists(
                bq_client.dataset_ref_for_id(dataset_id=v.dataset_id), v.view_id
            ):
                logging.info(
                    "Skipping materialization of view [%s.%s] which does not exist",
                    v.dataset_id,
                    v.view_id,
                )
                return

            bq_client.materialize_view_to_table(v)

        dag_walker.process_dag(_materialize_view)

    except Exception:
        # Record the failure metric, then re-raise with the original traceback.
        # BUG FIX: the previous `raise e from e` chained the exception to
        # itself; a bare `raise` preserves the exception and its context.
        with monitoring.measurements(
            {monitoring.TagKey.CREATE_UPDATE_VIEWS_NAMESPACE: bq_view_namespace.value}
        ) as measurements:
            measurements.measure_int_put(m_failed_view_update, 1)
        raise
    def test_get_sub_dag_input_views_not_all_in_input_dag(self) -> None:
        """Requesting a sub-dag for views outside the source DAG should raise."""
        # Build a DAG from only the first two views.
        walker = BigQueryViewDagWalker(self.x_shaped_dag_views_list[0:2])

        with self.assertRaises(ValueError) as e:
            # Request views the source DAG does not contain.
            _ = walker.get_descendants_sub_dag(self.x_shaped_dag_views_list[1:5])

        self.assertTrue(
            str(e.exception).startswith("Found input views not in source DAG:")
        )
    def test_union_dags(self) -> None:
        """The union of a DAG with a superset DAG is just the superset."""
        x_shaped = BigQueryViewDagWalker(self.x_shaped_dag_views_list)
        # This DAG is a superset of the X-shaped DAG
        diamond_shaped = BigQueryViewDagWalker(self.diamond_shaped_dag_views_list)

        unioned_dag = BigQueryViewDagWalker.union_dags(x_shaped, diamond_shaped)

        self.assertCountEqual(self.diamond_shaped_dag_views_list, unioned_dag.views)
    def test_get_sub_dag_empty_input_views(self) -> None:
        """Empty input yields an empty sub-dag in both directions."""
        walker = BigQueryViewDagWalker(self.x_shaped_dag_views_list)

        self.assertCountEqual([], walker.get_descendants_sub_dag([]).views)
        self.assertCountEqual([], walker.get_ancestors_sub_dag([]).views)
    def test_dag_parents_materialized_non_default(self) -> None:
        """Checks that parent results are keyed by the parent view even when the child
        references a parent by its overridden (non-default) materialized address."""
        self.maxDiff = None
        view_1 = BigQueryView(
            dataset_id="dataset_1",
            view_id="table_1",
            description="table_1 description",
            should_materialize=True,
            materialized_address_override=BigQueryAddress(
                dataset_id="other_dataset_1", table_id="other_table_1"
            ),
            view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table`",
        )
        view_2 = BigQueryView(
            dataset_id="dataset_2",
            view_id="table_2",
            description="table_2 description",
            should_materialize=True,
            materialized_address_override=BigQueryAddress(
                dataset_id="other_dataset_2", table_id="other_table_2"
            ),
            view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table_2`",
        )
        # view_3 references view_1 by its view address, but view_2 by its
        # overridden materialized address (other_dataset_2.other_table_2).
        view_3 = BigQueryView(
            dataset_id="dataset_3",
            view_id="table_3",
            description="table_3 description",
            view_query_template="""
                SELECT * FROM `{project_id}.dataset_1.table_1`
                JOIN `{project_id}.other_dataset_2.other_table_2`
                USING (col)""",
        )
        walker = BigQueryViewDagWalker([view_1, view_2, view_3])

        def process_simple(
            view: BigQueryView, parent_results: Dict[BigQueryView, str]
        ) -> str:
            if view == view_3:
                # View 3 should have two parents
                self.assertEqual(
                    {view_1: view_1.view_id, view_2: view_2.view_id}, parent_results
                )

            return view.view_id

        result = walker.process_dag(process_simple)
        self.assertEqual(
            {view_1: view_1.view_id, view_2: view_2.view_id, view_3: view_3.view_id},
            result,
        )
    def test_populate_node_family_full_parentage_complex_dependencies(self) -> None:
        """Full parentage of a node should include transitive parents and the raw
        source table, even with diamond-style dependencies."""
        v1 = BigQueryView(
            dataset_id="dataset_1",
            view_id="table_1",
            description="table_1 description",
            view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table`",
        )
        v2 = BigQueryView(
            dataset_id="dataset_2",
            view_id="table_2",
            description="table_2 description",
            view_query_template="SELECT * FROM `{project_id}.dataset_1.table_1`",
        )
        v3 = BigQueryView(
            dataset_id="dataset_3",
            view_id="table_3",
            description="table_3 description",
            view_query_template="""
                           SELECT * FROM `{project_id}.dataset_1.table_1`
                           JOIN `{project_id}.dataset_2.table_2`
                           USING (col)""",
        )
        v4 = BigQueryView(
            dataset_id="dataset_4",
            view_id="table_4",
            description="table_4 description",
            view_query_template="""
                           SELECT * FROM `{project_id}.dataset_2.table_2`
                           JOIN `{project_id}.dataset_3.table_3`
                           USING (col)""",
        )

        walker = BigQueryViewDagWalker([v1, v2, v3, v4])
        leaf_node = walker.node_for_view(v4)

        walker.populate_node_family_for_node(
            node=leaf_node, view_source_table_datasets={"source_dataset"}
        )

        # All three upstream views plus the raw source table itself.
        expected_parentage = {
            DagKey(
                view_address=BigQueryAddress(
                    dataset_id="source_dataset", table_id="source_table"
                )
            ),
            DagKey.for_view(v1),
            DagKey.for_view(v2),
            DagKey.for_view(v3),
        }
        self.assertEqual(expected_parentage, leaf_node.node_family.full_parentage)
    def test_dag_touches_all_views(self) -> None:
        """Every view in the DAG should be processed exactly once.

        Fix: the original repeated the same `set(result.values())` assertion
        twice verbatim; the redundant second check is removed.
        """
        walker = BigQueryViewDagWalker(self.all_views)

        def process_simple(
            view: BigQueryView, _parent_results: Dict[BigQueryView, DagKey]
        ) -> DagKey:
            # Small sleep simulates real per-view work.
            time.sleep(MOCK_VIEW_PROCESS_TIME_SECONDS / 10)
            return DagKey.for_view(view)

        result = walker.process_dag(process_simple)

        # The set of processed keys must match the full node set exactly.
        expected_view_keys = set(walker.nodes_by_key)
        self.assertEqual(expected_view_keys, set(result.values()))
 def test_dag_with_cycle_after_root(self) -> None:
     """A cycle reachable only past the root should be detected with a clear error."""
     # dataset_2 and dataset_3 reference each other, forming the cycle.
     root_view = BigQueryView(
         dataset_id="dataset_1",
         view_id="table_1",
         view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table`",
     )
     cyclic_view_a = BigQueryView(
         dataset_id="dataset_2",
         view_id="table_2",
         view_query_template="""
         SELECT * FROM `{project_id}.dataset_1.table_1`
         JOIN `{project_id}.dataset_3.table_3`
         USING (col)""",
     )
     cyclic_view_b = BigQueryView(
         dataset_id="dataset_3",
         view_id="table_3",
         view_query_template="SELECT * FROM `{project_id}.dataset_2.table_2`",
     )

     with self.assertRaises(ValueError) as e:
         _ = BigQueryViewDagWalker([root_view, cyclic_view_a, cyclic_view_b])

     self.assertEqual(
         str(e.exception),
         "Detected cycle in graph reachable from ('dataset_1', 'table_1'): "
         "[('dataset_2', 'table_2'), ('dataset_3', 'table_3')]",
     )
 def test_dag_two_views_same_materialized_address(self) -> None:
     """Two distinct views may not materialize into the same table."""
     first = BigQueryView(
         dataset_id="dataset_1",
         view_id="table_1",
         description="table_1 description",
         should_materialize=True,
         materialized_address_override=BigQueryAddress(
             dataset_id="other_dataset", table_id="other_table"
         ),
         view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table`",
     )
     second = BigQueryView(
         dataset_id="dataset_2",
         view_id="table_2",
         description="table_2 description",
         should_materialize=True,
         materialized_address_override=BigQueryAddress(
             dataset_id="other_dataset", table_id="other_table"
         ),
         view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table_2`",
     )

     with self.assertRaises(ValueError) as e:
         _ = BigQueryViewDagWalker([first, second])

     self.assertTrue(
         str(e.exception).startswith(
             "Found materialized view address for view [('dataset_2', 'table_2')] "
             "that matches materialized_address of another view: "
             "[('dataset_1', 'table_1')]."
         )
     )
def _create_managed_dataset_and_deploy_views(
    views_to_update: List[BigQueryView],
    bq_region_override: Optional[str],
    force_materialize: bool,
    set_temp_dataset_table_expiration: bool = False,
) -> None:
    """Create and update the given views and their parent datasets, then clean up
    unmanaged views and datasets.

    Ensures each managed dataset exists, deletes unmanaged datasets and unmanaged
    views within managed datasets (skipped when a temporary dataset table
    expiration is set, i.e. for sandbox datasets), and finally walks the view DAG
    creating or updating every view. Views with a set materialized_address field
    are materialized into tables.

    Args:
        views_to_update: A list of view objects to be created or updated.
        bq_region_override: Optional BigQuery region to use instead of the default.
        force_materialize: If True, views are re-materialized even if unchanged.
        set_temp_dataset_table_expiration: If True, new datasets will be created
            with an expiration of TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS.
    """
    bq_client = BigQueryClientImpl(region_override=bq_region_override)
    dag_walker = BigQueryViewDagWalker(views_to_update)

    managed_views_map = get_managed_view_and_materialized_table_addresses_by_dataset(
        dag_walker
    )
    _create_all_datasets_if_necessary(
        bq_client, list(managed_views_map.keys()), set_temp_dataset_table_expiration
    )

    if not set_temp_dataset_table_expiration:
        # We don't want to be deleting unmanaged views/tables if we're creating
        # sandbox datasets.
        cleanup_datasets_and_delete_unmanaged_views(
            bq_client, managed_views_map, dry_run=False
        )

    def process_fn(v: BigQueryView, parent_results: Dict[BigQueryView, bool]) -> bool:
        """Returns True if this view or any of its parents were updated."""
        return _create_or_update_view_and_materialize_if_necessary(
            bq_client, v, parent_results, force_materialize
        )

    dag_walker.process_dag(process_fn)
 def test_get_managed_views_for_dataset_map_empty_list(self) -> None:
     """An empty DAG should yield an empty dataset-to-addresses mapping."""
     walker = BigQueryViewDagWalker(self.empty_view_list)

     result = get_managed_view_and_materialized_table_addresses_by_dataset(walker)

     expected: Dict[str, Set[BigQueryAddress]] = {}
     self.assertEqual(expected, result)
    def test_get_sub_dag_single_node_input(self) -> None:
        """For a single-node DAG, both sub-dags contain just that node."""
        single_view = self.x_shaped_dag_views_list[0:1]
        walker = BigQueryViewDagWalker(single_view)

        descendants = walker.get_descendants_sub_dag(single_view)
        self.assertCountEqual(single_view, descendants.views)

        ancestors = walker.get_ancestors_sub_dag(single_view)
        self.assertCountEqual(single_view, ancestors.views)
    def test_dag_does_not_process_until_parents_processed(self) -> None:
        """Checks that process_dag never starts a view before all of its parent views
        have finished processing.

        Each process call records its node key in `all_processed` (guarded by a
        mutex); when a non-root node starts, every parent key must either already be
        recorded or be a valid source data table.
        """
        walker = BigQueryViewDagWalker(self.all_views)

        mutex = threading.Lock()
        all_processed = set()

        def process_check_parents(
            view: BigQueryView, parent_results: Dict[BigQueryView, None]
        ) -> None:
            with mutex:
                node_key = (view.dataset_id, view.view_id)
                node = walker.nodes_by_key[node_key]
                if not node.is_root:
                    for parent_key in node.parent_keys:
                        if parent_key not in all_processed:
                            # The only parents that won't have been fully processed are source data tables
                            try:
                                self.assertIsValidSourceDataTable(
                                    child_view_key=node_key, source_table_key=parent_key
                                )
                            except ValueError as e:
                                raise ValueError(
                                    f"Found parent view [{parent_key}] that was not processed before "
                                    f"child [{node_key}] started processing."
                                ) from e
                        else:
                            # Processed parents must also appear in the results
                            # handed to this call.
                            self.assertIn(
                                walker.nodes_by_key[parent_key].view, parent_results
                            )

            # Randomized delay simulates real work so ordering bugs can surface.
            time.sleep(
                random.uniform(
                    MOCK_VIEW_PROCESS_TIME_SECONDS, MOCK_VIEW_PROCESS_TIME_SECONDS * 2
                )
            )
            with mutex:
                all_processed.add(node_key)

        result = walker.process_dag(process_check_parents)
        self.assertEqual(len(self.all_views), len(result))
    def test_dag_init(self) -> None:
        """Basic DAG construction invariants."""
        walker = BigQueryViewDagWalker(self.all_views)

        # Every input view gets exactly one node.
        self.assertEqual(len(self.all_views), len(walker.nodes_by_key))

        for key, node in walker.nodes_by_key.items():
            if not node.parent_keys:
                self.assertIsValidEmptyParentsView(node)

            if not node.is_root:
                continue
            # Root views should only have source data tables as parents
            for parent_key in node.parent_keys:
                self.assertIsValidSourceDataTable(key, parent_key)
# Example #24
# 0
def main() -> None:
    """Executes the main flow of the script."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--project-id",
        choices=[GCP_PROJECT_STAGING, GCP_PROJECT_PRODUCTION],
        help="Used to select which GCP project against which to run this script.",
        required=True,
    )
    parser.add_argument(
        "--dry-run",
        default=True,
        type=str_to_bool,
        help="Runs delete in dry-run mode, only prints the views/tables it would delete.",
    )
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.INFO)

    with local_project_id_override(args.project_id):
        # Build every deployed view, skipping builders whose conditions fail.
        views = []
        for view_builder in DEPLOYED_VIEW_BUILDERS:
            # Source-table datasets must never contain deployed views.
            if view_builder.dataset_id in VIEW_SOURCE_TABLE_DATASETS:
                raise ValueError(
                    f"Found view [{view_builder.view_id}] in source-table-only dataset [{view_builder.dataset_id}]"
                )

            try:
                views.append(view_builder.build())
            except BigQueryViewBuilderShouldNotBuildError:
                logging.warning(
                    "Condition failed for view builder %s in dataset %s. Continuing without it.",
                    view_builder.view_id,
                    view_builder.dataset_id,
                )

        dag_walker = BigQueryViewDagWalker(views)
        managed_views_map = get_managed_view_and_materialized_table_addresses_by_dataset(
            dag_walker
        )

        cleanup_datasets_and_delete_unmanaged_views(
            bq_client=BigQueryClientImpl(),
            managed_views_map=managed_views_map,
            dry_run=args.dry_run,
        )
# Example #25
# 0
def build_dag_walker(dataset_id: str, view_id: str) -> BigQueryViewDagWalker:
    """Builds a DAG walker over all deployed views, validating that the given
    dataset/view pair names a deployed view builder.

    Args:
        dataset_id: Dataset of the view to validate.
        view_id: Id of the view to validate.

    Raises:
        ValueError: If no deployed view builder matches dataset_id/view_id.
    """
    # Idiom fix: the original maintained a manual flag and kept scanning the
    # whole builder list even after a match; any() short-circuits instead.
    if not any(
        builder.dataset_id == dataset_id and builder.view_id == view_id
        for builder in DEPLOYED_VIEW_BUILDERS
    ):
        raise ValueError(f"invalid view {dataset_id}.{view_id}")
    return BigQueryViewDagWalker(
        _build_views_to_update(
            view_source_table_datasets=VIEW_SOURCE_TABLE_DATASETS,
            candidate_view_builders=DEPLOYED_VIEW_BUILDERS,
            dataset_overrides=None,
        )
    )
    def test_get_sub_dag_middle_node(self) -> None:
        """Sub-dags built from the middle node of the X-shaped DAG."""
        walker = BigQueryViewDagWalker(self.x_shaped_dag_views_list)
        views = self.x_shaped_dag_views_list
        middle = [views[2]]

        # Downstream of the middle node: itself plus both leaves.
        descendants_sub_dag = walker.get_descendants_sub_dag(middle)
        self.assertCountEqual(
            [views[2], views[3], views[4]], descendants_sub_dag.views
        )

        # Upstream of the middle node: itself plus both roots.
        ancestors_sub_dag = walker.get_ancestors_sub_dag(middle)
        self.assertCountEqual(
            [views[0], views[1], views[2]], ancestors_sub_dag.views
        )

        # Unioning the two directions recovers the entire DAG.
        both_directions_dag = BigQueryViewDagWalker.union_dags(
            descendants_sub_dag, ancestors_sub_dag
        )
        self.assertCountEqual(views, both_directions_dag.views)
 def test_get_managed_views_for_dataset_map_x_shaped_dag(self) -> None:
     """Each of the five X-shaped DAG views lives in its own dataset."""
     walker = BigQueryViewDagWalker(self.x_shaped_dag_views_list)

     result = get_managed_view_and_materialized_table_addresses_by_dataset(walker)

     expected: Dict[str, Set[BigQueryAddress]] = {
         f"dataset_{i}": {
             BigQueryAddress(dataset_id=f"dataset_{i}", table_id=f"table_{i}")
         }
         for i in range(1, 6)
     }
     self.assertEqual(expected, result)
 def test_get_managed_views_for_dataset_map_all_views_same_dataset(self) -> None:
     """Views sharing one dataset should collapse into a single map entry."""
     walker = BigQueryViewDagWalker(self.all_views_same_dataset)

     result = get_managed_view_and_materialized_table_addresses_by_dataset(walker)

     expected: Dict[str, Set[BigQueryAddress]] = {
         "dataset_1": {
             BigQueryAddress(dataset_id="dataset_1", table_id=f"table_{i}")
             for i in (1, 2, 3)
         },
     }
     self.assertEqual(expected, result)
    def test_dag_with_cycle_at_root(self) -> None:
        """Two views that each select from the other leave no valid root."""
        first = BigQueryView(
            dataset_id="dataset_1",
            view_id="table_1",
            view_query_template="SELECT * FROM `{project_id}.dataset_2.table_2`",
        )
        second = BigQueryView(
            dataset_id="dataset_2",
            view_id="table_2",
            view_query_template="SELECT * FROM `{project_id}.dataset_1.table_1`",
        )

        with self.assertRaises(ValueError) as e:
            _ = BigQueryViewDagWalker([first, second])

        self.assertEqual(
            str(e.exception), "No roots detected. Input views contain a cycle."
        )
 def test_dag_no_cycle(self) -> None:
     """Two independent roots feeding one child - construction succeeds."""
     root_a = BigQueryView(
         dataset_id="dataset_1",
         view_id="table_1",
         view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table`",
     )
     root_b = BigQueryView(
         dataset_id="dataset_2",
         view_id="table_2",
         view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table_2`",
     )
     child = BigQueryView(
         dataset_id="dataset_3",
         view_id="table_3",
         view_query_template="""
         SELECT * FROM `{project_id}.dataset_1.table_1`
         JOIN `{project_id}.dataset_2.table_2`
         USING (col)""",
     )
     _ = BigQueryViewDagWalker([root_a, root_b, child])