def test_get_managed_views_for_dataset_map_empty_list(self) -> None:
    """An empty view list should produce an empty dataset-to-addresses map."""
    # Arrange
    dag_walker = BigQueryViewDagWalker(self.empty_view_list)
    # Act
    actual: Dict[str, Set[BigQueryAddress]] = (
        get_managed_view_and_materialized_table_addresses_by_dataset(dag_walker)
    )
    # Assert
    self.assertEqual({}, actual)
# Example no. 2 (score: 0)
def main() -> None:
    """Executes the main flow of the script."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--project-id",
        choices=[GCP_PROJECT_STAGING, GCP_PROJECT_PRODUCTION],
        required=True,
        help="Used to select which GCP project against which to run this script.",
    )
    parser.add_argument(
        "--dry-run",
        default=True,
        type=str_to_bool,
        help="Runs delete in dry-run mode, only prints the views/tables it would delete.",
    )
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.INFO)

    with local_project_id_override(args.project_id):
        built_views = []
        for builder in DEPLOYED_VIEW_BUILDERS:
            # Views must never live in a dataset reserved for source tables.
            if builder.dataset_id in VIEW_SOURCE_TABLE_DATASETS:
                raise ValueError(
                    f"Found view [{builder.view_id}] in source-table-only dataset [{builder.dataset_id}]"
                )

            try:
                built_views.append(builder.build())
            except BigQueryViewBuilderShouldNotBuildError:
                # A builder may opt out of building; skip it rather than fail.
                logging.warning(
                    "Condition failed for view builder %s in dataset %s. Continuing without it.",
                    builder.view_id,
                    builder.dataset_id,
                )

        managed_views_map = (
            get_managed_view_and_materialized_table_addresses_by_dataset(
                BigQueryViewDagWalker(built_views)
            )
        )

        cleanup_datasets_and_delete_unmanaged_views(
            bq_client=BigQueryClientImpl(),
            managed_views_map=managed_views_map,
            dry_run=args.dry_run,
        )
 def test_get_managed_views_for_dataset_map_all_views_same_dataset(self) -> None:
     """All views sharing one dataset should map under a single dataset key."""
     # Arrange
     dag_walker = BigQueryViewDagWalker(self.all_views_same_dataset)
     # Act
     actual = get_managed_view_and_materialized_table_addresses_by_dataset(dag_walker)
     # Assert
     expected: Dict[str, Set[BigQueryAddress]] = {
         "dataset_1": {
             BigQueryAddress(dataset_id="dataset_1", table_id=table_id)
             for table_id in ("table_1", "table_2", "table_3")
         },
     }
     self.assertEqual(expected, actual)
 def test_get_managed_views_for_dataset_map_x_shaped_dag(self) -> None:
     """In the X-shaped DAG each view lives in its own dataset, one key apiece."""
     # Arrange
     dag_walker = BigQueryViewDagWalker(self.x_shaped_dag_views_list)
     # Act
     actual = get_managed_view_and_materialized_table_addresses_by_dataset(dag_walker)
     # Assert: dataset_N -> {table_N} for N in 1..5.
     expected: Dict[str, Set[BigQueryAddress]] = {
         f"dataset_{n}": {
             BigQueryAddress(dataset_id=f"dataset_{n}", table_id=f"table_{n}")
         }
         for n in range(1, 6)
     }
     self.assertEqual(expected, actual)
def _create_managed_dataset_and_deploy_views(
    views_to_update: List[BigQueryView],
    bq_region_override: Optional[str],
    force_materialize: bool,
    set_temp_dataset_table_expiration: bool = False,
) -> None:
    """Create and update the given views and their parent datasets. Cleans up
    unmanaged views and datasets.

    For each dataset referenced by the given views, creates the dataset if it
    does not exist, and creates or updates the underlying views mapped to that
    dataset.

    If a view has a set materialized_address field, materializes the view into a
    table.

    Then, cleans up BigQuery by deleting unmanaged datasets and unmanaged views
    within managed datasets. This is not performed if a temporary dataset table
    expiration is already set.

    Args:
        views_to_update: A list of view objects to be created or updated.
        bq_region_override: If set, overrides the region associated with all
            BigQuery operations.
        force_materialize: Forwarded to
            _create_or_update_view_and_materialize_if_necessary for every view;
            presumably forces re-materialization even when nothing changed —
            TODO(review): confirm against that helper.
        set_temp_dataset_table_expiration: If True, new datasets will be created
            with an expiration of TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS.
    """
    bq_client = BigQueryClientImpl(region_override=bq_region_override)
    dag_walker = BigQueryViewDagWalker(views_to_update)

    managed_views_map = get_managed_view_and_materialized_table_addresses_by_dataset(
        dag_walker
    )
    _create_all_datasets_if_necessary(
        bq_client, list(managed_views_map.keys()), set_temp_dataset_table_expiration
    )

    if not set_temp_dataset_table_expiration:
        # We don't want to be deleting unmanaged views/tables if we're creating
        # sandbox datasets.
        cleanup_datasets_and_delete_unmanaged_views(
            bq_client, managed_views_map, dry_run=False
        )

    def process_fn(v: BigQueryView, parent_results: Dict[BigQueryView, bool]) -> bool:
        """Returns True if this view or any of its parents were updated."""
        return _create_or_update_view_and_materialize_if_necessary(
            bq_client, v, parent_results, force_materialize
        )

    dag_walker.process_dag(process_fn)
    def test_cleanup_datasets_and_delete_unmanaged_views_dry_run(
        self, mock_get_datasets_that_have_ever_been_managed: mock.MagicMock
    ) -> None:
        """With dry_run=True, tables are listed but nothing is ever deleted."""
        mock_get_datasets_that_have_ever_been_managed.return_value = {
            "dataset_1",
            "dataset_2",
            "dataset_unmanaged",
        }

        # One DatasetReference per dataset that has ever been managed.
        dataset_refs = {
            dataset_id: bigquery.dataset.DatasetReference(self.project_id, dataset_id)
            for dataset_id in ("dataset_1", "dataset_2", "dataset_unmanaged")
        }

        mock_view_builders = [
            SimpleBigQueryViewBuilder(
                dataset_id=dataset_id,
                view_id=view_id,
                description=f"{view_id} description",
                view_query_template="SELECT NULL LIMIT 0",
                should_materialize=False,
            )
            for dataset_id, view_id in [
                ("dataset_1", "my_fake_view"),
                ("dataset_2", "my_fake_view_2"),
            ]
        ]

        def _table_resource(dataset_id, table_id):
            # Minimal table resource payload accepted by TableListItem.
            return {
                "tableReference": {
                    "projectId": self.project_id,
                    "datasetId": dataset_id,
                    "tableId": table_id,
                },
            }

        # Each managed dataset holds one managed view plus one bogus (unmanaged) view.
        tables_by_dataset = {
            "dataset_1": [
                bigquery.table.TableListItem(
                    _table_resource("dataset_1", "my_fake_view")
                ),
                bigquery.table.TableListItem(
                    _table_resource("dataset_1", "bogus_view_1")
                ),
            ],
            "dataset_2": [
                bigquery.table.TableListItem(
                    _table_resource("dataset_2", "my_fake_view_2")
                ),
                bigquery.table.TableListItem(
                    _table_resource("dataset_2", "bogus_view_2")
                ),
            ],
        }

        def mock_list_tables(dataset_id: str) -> bigquery.table.TableListItem:
            if dataset_id in tables_by_dataset:
                return tables_by_dataset[dataset_id]
            raise ValueError(f"No tables for id: {dataset_id}")

        self.mock_client.list_tables.side_effect = mock_list_tables

        def get_dataset_ref(dataset_id: str) -> bigquery.dataset.DatasetReference:
            if dataset_id in dataset_refs:
                return dataset_refs[dataset_id]
            raise ValueError(f"No dataset for id: {dataset_id}")

        self.mock_client.dataset_ref_for_id.side_effect = get_dataset_ref
        self.mock_client.dataset_exists.return_value = True

        built_views = [builder.build() for builder in mock_view_builders]
        managed_views_map = (
            get_managed_view_and_materialized_table_addresses_by_dataset(
                BigQueryViewDagWalker(built_views)
            )
        )

        cleanup_datasets_and_delete_unmanaged_views(
            self.mock_client, managed_views_map, dry_run=True
        )

        # Dry run: reads are fine, deletes must never fire.
        self.mock_client.delete_dataset.assert_not_called()
        self.mock_client.list_tables.assert_called()
        self.mock_client.delete_table.assert_not_called()
    def test_cleanup_datasets_and_delete_unmanaged_views_unmanaged_dataset_and_dataset_not_in_BigQuery(
        self, mock_get_datasets_that_have_ever_been_managed: mock.MagicMock
    ) -> None:
        """A once-managed dataset that no longer exists in BigQuery should only
        produce a log message; no deletes should occur."""
        mock_get_datasets_that_have_ever_been_managed.return_value = {
            "dataset_1",
            "dataset_unmanaged",
        }

        managed_ref = bigquery.dataset.DatasetReference(self.project_id, "dataset_1")
        vanished_ref = bigquery.dataset.DatasetReference(
            self.project_id, "dataset_unmanaged"
        )

        mock_view_builders = [
            SimpleBigQueryViewBuilder(
                dataset_id="dataset_1",
                view_id="my_fake_view",
                description="my_fake_view description",
                view_query_template="SELECT NULL LIMIT 0",
                should_materialize=False,
            )
        ]

        self.mock_client.list_tables.return_value = [
            bigquery.table.TableListItem(
                {
                    "tableReference": {
                        "projectId": self.project_id,
                        "datasetId": "dataset_1",
                        "tableId": "my_fake_view",
                    },
                }
            ),
        ]

        def get_dataset_ref(dataset_id: str) -> bigquery.dataset.DatasetReference:
            for ref in (managed_ref, vanished_ref):
                if dataset_id == ref.dataset_id:
                    return ref
            raise ValueError(f"No dataset for id: {dataset_id}")

        self.mock_client.dataset_ref_for_id.side_effect = get_dataset_ref

        # Only dataset_1 still exists in BigQuery; dataset_unmanaged is gone.
        def mock_dataset_exists(dataset_ref: bigquery.dataset.DatasetReference) -> bool:
            return dataset_ref == managed_ref

        self.mock_client.dataset_exists.side_effect = mock_dataset_exists

        built_views = [builder.build() for builder in mock_view_builders]
        managed_views_map = (
            get_managed_view_and_materialized_table_addresses_by_dataset(
                BigQueryViewDagWalker(built_views)
            )
        )

        with self.assertLogs() as captured_log:
            cleanup_datasets_and_delete_unmanaged_views(
                self.mock_client, managed_views_map, dry_run=False
            )

        # check that there is only one log message
        self.assertEqual(len(captured_log.records), 1)
        self.assertEqual(
            captured_log.records[0].getMessage(),
            "Dataset dataset_unmanaged isn't being managed and no longer exists in BigQuery. It can be safely removed from the list DATASETS_THAT_HAVE_EVER_BEEN_MANAGED.",
        )
        self.mock_client.delete_dataset.assert_not_called()
        self.mock_client.delete_table.assert_not_called()
    def test_cleanup_datasets_and_delete_unmanaged_views_unmanaged_dataset(
        self, mock_get_datasets_that_have_ever_been_managed: mock.MagicMock
    ) -> None:
        """A once-managed dataset with no managed views left should be deleted
        (with its contents) when dry_run=False."""
        mock_get_datasets_that_have_ever_been_managed.return_value = {
            "dataset_1",
            "bogus_dataset",
        }

        managed_ref = bigquery.dataset.DatasetReference(self.project_id, "dataset_1")
        bogus_ref = bigquery.dataset.DatasetReference(self.project_id, "bogus_dataset")

        mock_view_builders = [
            SimpleBigQueryViewBuilder(
                dataset_id="dataset_1",
                view_id="my_fake_view",
                view_query="SELECT NULL LIMIT 0",
                description="my_fake_view description",
                view_query_template="a",
                should_materialize=False,
                materialized_address_override=None,
                should_build_predicate=None,
            )
        ]

        self.mock_client.list_tables.return_value = [
            bigquery.table.TableListItem(
                {
                    "tableReference": {
                        "projectId": self.project_id,
                        "datasetId": "dataset_1",
                        "tableId": "my_fake_view",
                    },
                }
            ),
        ]

        def get_dataset_ref(dataset_id: str) -> bigquery.dataset.DatasetReference:
            for ref in (managed_ref, bogus_ref):
                if dataset_id == ref.dataset_id:
                    return ref
            raise ValueError(f"No dataset for id: {dataset_id}")

        self.mock_client.dataset_ref_for_id.side_effect = get_dataset_ref
        self.mock_client.dataset_exists.return_value = True

        built_views = [builder.build() for builder in mock_view_builders]
        managed_views_map = (
            get_managed_view_and_materialized_table_addresses_by_dataset(
                BigQueryViewDagWalker(built_views)
            )
        )

        cleanup_datasets_and_delete_unmanaged_views(
            self.mock_client, managed_views_map, dry_run=False
        )

        # Only the unmanaged dataset is removed; managed contents stay put.
        self.mock_client.delete_dataset.assert_called_with(
            self.mock_client.dataset_ref_for_id("bogus_dataset"), delete_contents=True
        )
        self.mock_client.list_tables.assert_called()
        self.mock_client.delete_table.assert_not_called()
def rematerialize_views(
    views_to_update: List[BigQueryView],
    all_view_builders: Sequence[BigQueryViewBuilder],
    view_source_table_datasets: Set[str],
    dataset_overrides: Optional[Dict[str, str]] = None,
    skip_missing_views: bool = False,
    bq_region_override: Optional[str] = None,
) -> None:
    """For all views in the provided |views_to_update| list, re-materializes any
    materialized views. This should be called only when we want to refresh the data in
    the materialized view(s), not when we want to update the underlying query of the
    view(s).

    Args:
        views_to_update: List of views to re-materialize
        all_view_builders: Superset of the views_to_update that contains all views that
            either depend on or are dependents of the list of input views.
        view_source_table_datasets: Set of datasets containing tables that can be
            treated as root nodes in the view dependency graph.
        dataset_overrides: A dictionary mapping dataset_ids to the dataset name they
            should be replaced with for the given list of views_to_update.
        skip_missing_views: If True, ignores any input views that do not exist. If
            False, crashes if tries to materialize a view that does not exist.
        bq_region_override: If set, overrides the region (e.g. us-east1) associated with
            all BigQuery operations.
    """
    # Non-empty overrides imply sandbox datasets, which get a default expiration.
    set_default_table_expiration_for_new_datasets = bool(dataset_overrides)
    if set_default_table_expiration_for_new_datasets:
        logging.info(
            "Found non-empty dataset overrides. New datasets created in this process will have a "
            "default table expiration of 24 hours."
        )

    try:
        bq_client = BigQueryClientImpl(region_override=bq_region_override)

        all_views_dag_walker = BigQueryViewDagWalker(
            _build_views_to_update(
                view_source_table_datasets=view_source_table_datasets,
                candidate_view_builders=all_view_builders,
                dataset_overrides=dataset_overrides,
            )
        )
        dataset_map = get_managed_view_and_materialized_table_addresses_by_dataset(
            all_views_dag_walker
        )
        _create_all_datasets_if_necessary(
            bq_client,
            list(dataset_map.keys()),
            set_default_table_expiration_for_new_datasets,
        )

        # Limit DAG to only ancestor views and the set of views to update
        ancestors_dag_walker = all_views_dag_walker.get_ancestors_sub_dag(
            views_to_update
        )

        def _materialize_view(
            v: BigQueryView, _parent_results: Dict[BigQueryView, None]
        ) -> None:
            """Re-materializes a single view, skipping non-materialized (and,
            optionally, missing) views."""
            if not v.materialized_address:
                logging.info(
                    "Skipping non-materialized view [%s.%s].", v.dataset_id, v.view_id
                )
                return

            if skip_missing_views and not bq_client.table_exists(
                bq_client.dataset_ref_for_id(dataset_id=v.dataset_id), v.view_id
            ):
                logging.info(
                    "Skipping materialization of view [%s.%s] which does not exist",
                    v.dataset_id,
                    v.view_id,
                )
                return

            bq_client.materialize_view_to_table(v)

        ancestors_dag_walker.process_dag(_materialize_view)
    except Exception:
        # Record the failure metric, then re-raise with the original traceback.
        # NOTE: the previous `raise e from e` self-chained the exception (setting
        # __cause__ to the exception itself), which clutters tracebacks with a
        # redundant "direct cause" line; a bare `raise` is the correct re-raise.
        with monitoring.measurements() as measurements:
            measurements.measure_int_put(m_failed_view_update, 1)
        raise