Example #1
0
def main() -> None:
    """Entry point: parses CLI flags, builds all deployed views, and runs the
    unmanaged-view/dataset cleanup (dry-run by default)."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--project-id",
        choices=[GCP_PROJECT_STAGING, GCP_PROJECT_PRODUCTION],
        help="Used to select which GCP project against which to run this script.",
        required=True,
    )
    parser.add_argument(
        "--dry-run",
        default=True,
        type=str_to_bool,
        help="Runs delete in dry-run mode, only prints the views/tables it would delete.",
    )
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.INFO)

    with local_project_id_override(args.project_id):
        built_views = []
        for builder in DEPLOYED_VIEW_BUILDERS:
            # Views must never live in datasets reserved for source tables.
            if builder.dataset_id in VIEW_SOURCE_TABLE_DATASETS:
                raise ValueError(
                    f"Found view [{builder.view_id}] in source-table-only dataset [{builder.dataset_id}]"
                )

            try:
                # Builders whose build condition fails are skipped, not fatal.
                built_views.append(builder.build())
            except BigQueryViewBuilderShouldNotBuildError:
                logging.warning(
                    "Condition failed for view builder %s in dataset %s. Continuing without it.",
                    builder.view_id,
                    builder.dataset_id,
                )

        dag_walker = BigQueryViewDagWalker(built_views)
        managed_views_map = (
            get_managed_view_and_materialized_table_addresses_by_dataset(
                dag_walker))

        cleanup_datasets_and_delete_unmanaged_views(
            bq_client=BigQueryClientImpl(),
            managed_views_map=managed_views_map,
            dry_run=args.dry_run,
        )
def _create_managed_dataset_and_deploy_views(
    views_to_update: List[BigQueryView],
    bq_region_override: Optional[str],
    force_materialize: bool,
    set_temp_dataset_table_expiration: bool = False,
) -> None:
    """Creates/updates the given views and their parent datasets, then cleans up
    unmanaged views and datasets.

    Derives the set of managed datasets from the view DAG, creates each dataset
    if it does not exist, and creates or updates the views via a DAG walk so
    that parents are processed before children.

    Unmanaged datasets (and unmanaged views within managed datasets) are
    deleted, unless a temporary dataset table expiration is set — in that case
    we are building sandbox datasets and must not touch the real ones.

    Args:
        views_to_update: A list of view objects to be created or updated.
        bq_region_override: If set, passed to BigQueryClientImpl to override the
            BigQuery region the client operates in.
        force_materialize: Forwarded to
            _create_or_update_view_and_materialize_if_necessary; presumably
            forces re-materialization even when a view is unchanged — confirm
            against that helper.
        set_temp_dataset_table_expiration: If True, new datasets will be created with an expiration
            of TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS.
    """
    bq_client = BigQueryClientImpl(region_override=bq_region_override)
    dag_walker = BigQueryViewDagWalker(views_to_update)

    managed_views_map = get_managed_view_and_materialized_table_addresses_by_dataset(
        dag_walker)
    managed_dataset_ids = list(managed_views_map.keys())
    _create_all_datasets_if_necessary(bq_client, managed_dataset_ids,
                                      set_temp_dataset_table_expiration)

    if not set_temp_dataset_table_expiration:
        # We don't want to be deleting unmanaged views/tables if we're creating sandbox datasets
        cleanup_datasets_and_delete_unmanaged_views(bq_client,
                                                    managed_views_map,
                                                    dry_run=False)

    def process_fn(v: BigQueryView, parent_results: Dict[BigQueryView,
                                                         bool]) -> bool:
        """Returns True if this view or any of its parents were updated."""
        return _create_or_update_view_and_materialize_if_necessary(
            bq_client, v, parent_results, force_materialize)

    dag_walker.process_dag(process_fn)
    def test_cleanup_datasets_and_delete_unmanaged_views_dry_run(
        self, mock_get_datasets_that_have_ever_been_managed: mock.MagicMock
    ) -> None:
        """Dry-run cleanup must delete nothing.

        Sets up two managed datasets (each holding one managed view and one
        bogus unmanaged view) plus one unmanaged dataset, then verifies that
        cleanup with dry_run=True lists tables but performs no deletions.
        """
        mock_get_datasets_that_have_ever_been_managed.return_value = {
            "dataset_1",
            "dataset_2",
            "dataset_unmanaged",
        }

        dataset = bigquery.dataset.DatasetReference(self.project_id, "dataset_1")
        dataset_2 = bigquery.dataset.DatasetReference(self.project_id, "dataset_2")
        dataset_3 = bigquery.dataset.DatasetReference(
            self.project_id, "dataset_unmanaged"
        )

        mock_view_builders = [
            SimpleBigQueryViewBuilder(
                dataset_id="dataset_1",
                view_id="my_fake_view",
                description="my_fake_view description",
                view_query_template="SELECT NULL LIMIT 0",
                should_materialize=False,
            ),
            SimpleBigQueryViewBuilder(
                dataset_id="dataset_2",
                view_id="my_fake_view_2",
                description="my_fake_view_2 description",
                view_query_template="SELECT NULL LIMIT 0",
                should_materialize=False,
            ),
        ]

        mock_table_resource_ds_1_table = {
            "tableReference": {
                "projectId": self.project_id,
                "datasetId": "dataset_1",
                "tableId": "my_fake_view",
            },
        }

        mock_table_resource_ds_1_table_bogus = {
            "tableReference": {
                "projectId": self.project_id,
                "datasetId": "dataset_1",
                "tableId": "bogus_view_1",
            },
        }

        mock_table_resource_ds_2_table = {
            "tableReference": {
                "projectId": self.project_id,
                "datasetId": "dataset_2",
                "tableId": "my_fake_view_2",
            },
        }

        mock_table_resource_ds_2_table_bogus = {
            "tableReference": {
                "projectId": self.project_id,
                "datasetId": "dataset_2",
                "tableId": "bogus_view_2",
            },
        }

        # Fix: the original annotated this as returning a single TableListItem,
        # but it returns a list of them.
        def mock_list_tables(dataset_id: str) -> List[bigquery.table.TableListItem]:
            """Returns the mock tables for each managed dataset, including one
            bogus table per dataset that cleanup would delete in a real run."""
            if dataset_id == dataset.dataset_id:
                return [
                    bigquery.table.TableListItem(mock_table_resource_ds_1_table),
                    bigquery.table.TableListItem(mock_table_resource_ds_1_table_bogus),
                ]
            if dataset_id == dataset_2.dataset_id:
                return [
                    bigquery.table.TableListItem(mock_table_resource_ds_2_table),
                    bigquery.table.TableListItem(mock_table_resource_ds_2_table_bogus),
                ]
            raise ValueError(f"No tables for id: {dataset_id}")

        self.mock_client.list_tables.side_effect = mock_list_tables

        def get_dataset_ref(dataset_id: str) -> bigquery.dataset.DatasetReference:
            if dataset_id == dataset.dataset_id:
                return dataset
            if dataset_id == dataset_2.dataset_id:
                return dataset_2
            if dataset_id == dataset_3.dataset_id:
                return dataset_3
            raise ValueError(f"No dataset for id: {dataset_id}")

        self.mock_client.dataset_ref_for_id.side_effect = get_dataset_ref
        self.mock_client.dataset_exists.return_value = True

        views_to_update = [view_builder.build() for view_builder in mock_view_builders]

        dag_walker = BigQueryViewDagWalker(views_to_update)

        managed_views_map = (
            get_managed_view_and_materialized_table_addresses_by_dataset(dag_walker)
        )

        cleanup_datasets_and_delete_unmanaged_views(
            self.mock_client, managed_views_map, dry_run=True
        )

        # Dry run: tables are inspected but nothing is deleted.
        self.mock_client.delete_dataset.assert_not_called()
        self.mock_client.list_tables.assert_called()
        self.mock_client.delete_table.assert_not_called()
    def test_cleanup_datasets_and_delete_unmanaged_views_unmanaged_dataset_and_dataset_not_in_BigQuery(
        self, mock_get_datasets_that_have_ever_been_managed: mock.MagicMock
    ) -> None:
        """An unmanaged dataset that no longer exists in BigQuery triggers a
        warning log but no deletions.

        "dataset_unmanaged" is in the ever-managed set but dataset_exists
        reports it gone, so cleanup should only log that it can be removed from
        DATASETS_THAT_HAVE_EVER_BEEN_MANAGED.
        """
        mock_get_datasets_that_have_ever_been_managed.return_value = {
            "dataset_1",
            "dataset_unmanaged",
        }

        dataset = bigquery.dataset.DatasetReference(self.project_id, "dataset_1")
        dataset_2 = bigquery.dataset.DatasetReference(
            self.project_id, "dataset_unmanaged"
        )

        mock_view_builders = [
            SimpleBigQueryViewBuilder(
                dataset_id="dataset_1",
                view_id="my_fake_view",
                description="my_fake_view description",
                view_query_template="SELECT NULL LIMIT 0",
                should_materialize=False,
            )
        ]

        mock_table_resource_ds_1_table_1 = {
            "tableReference": {
                "projectId": self.project_id,
                "datasetId": "dataset_1",
                "tableId": "my_fake_view",
            },
        }

        self.mock_client.list_tables.return_value = [
            bigquery.table.TableListItem(mock_table_resource_ds_1_table_1),
        ]

        def get_dataset_ref(dataset_id: str) -> bigquery.dataset.DatasetReference:
            if dataset_id == dataset.dataset_id:
                return dataset
            if dataset_id == dataset_2.dataset_id:
                return dataset_2
            raise ValueError(f"No dataset for id: {dataset_id}")

        self.mock_client.dataset_ref_for_id.side_effect = get_dataset_ref

        # Only "dataset_1" still exists in BigQuery; the unmanaged one is gone.
        def mock_dataset_exists(dataset_ref: bigquery.dataset.DatasetReference) -> bool:
            if dataset_ref == dataset:
                return True
            return False

        self.mock_client.dataset_exists.side_effect = mock_dataset_exists

        views_to_update = [view_builder.build() for view_builder in mock_view_builders]

        dag_walker = BigQueryViewDagWalker(views_to_update)

        managed_views_map = (
            get_managed_view_and_materialized_table_addresses_by_dataset(dag_walker)
        )

        with self.assertLogs() as captured_log:
            cleanup_datasets_and_delete_unmanaged_views(
                self.mock_client, managed_views_map, dry_run=False
            )
        # Exactly one log record: the "safe to remove" notice for the vanished
        # unmanaged dataset.
        self.assertEqual(len(captured_log.records), 1)
        self.assertEqual(
            captured_log.records[0].getMessage(),
            "Dataset dataset_unmanaged isn't being managed and no longer exists in BigQuery. It can be safely removed from the list DATASETS_THAT_HAVE_EVER_BEEN_MANAGED.",
        )
        self.mock_client.delete_dataset.assert_not_called()
        self.mock_client.delete_table.assert_not_called()
    def test_cleanup_datasets_and_delete_unmanaged_views_unmanaged_dataset(
        self, mock_get_datasets_that_have_ever_been_managed: mock.MagicMock
    ) -> None:
        """A previously-managed dataset that still exists in BigQuery but is no
        longer managed gets deleted (with contents) by a real cleanup run."""
        mock_get_datasets_that_have_ever_been_managed.return_value = {
            "dataset_1",
            "bogus_dataset",
        }

        dataset = bigquery.dataset.DatasetReference(self.project_id, "dataset_1")
        dataset_2 = bigquery.dataset.DatasetReference(self.project_id, "bogus_dataset")

        # Single managed view; kwargs match the original sample_views expansion.
        mock_view_builders = [
            SimpleBigQueryViewBuilder(
                dataset_id="dataset_1",
                view_id="my_fake_view",
                view_query="SELECT NULL LIMIT 0",
                description="my_fake_view description",
                view_query_template="a",
                should_materialize=False,
                materialized_address_override=None,
                should_build_predicate=None,
            )
        ]

        mock_table_resource_ds_1_table_1 = {
            "tableReference": {
                "projectId": self.project_id,
                "datasetId": "dataset_1",
                "tableId": "my_fake_view",
            },
        }

        self.mock_client.list_tables.return_value = [
            bigquery.table.TableListItem(mock_table_resource_ds_1_table_1),
        ]

        known_refs = {
            dataset.dataset_id: dataset,
            dataset_2.dataset_id: dataset_2,
        }

        def get_dataset_ref(dataset_id: str) -> bigquery.dataset.DatasetReference:
            if dataset_id in known_refs:
                return known_refs[dataset_id]
            raise ValueError(f"No dataset for id: {dataset_id}")

        self.mock_client.dataset_ref_for_id.side_effect = get_dataset_ref
        self.mock_client.dataset_exists.return_value = True

        views_to_update = [builder.build() for builder in mock_view_builders]

        managed_views_map = (
            get_managed_view_and_materialized_table_addresses_by_dataset(
                BigQueryViewDagWalker(views_to_update)
            )
        )

        cleanup_datasets_and_delete_unmanaged_views(
            self.mock_client, managed_views_map, dry_run=False
        )

        # The stale dataset is dropped wholesale; the managed one is untouched.
        self.mock_client.delete_dataset.assert_called_with(
            self.mock_client.dataset_ref_for_id("bogus_dataset"), delete_contents=True
        )
        self.mock_client.list_tables.assert_called()
        self.mock_client.delete_table.assert_not_called()