Example #1
def export_dashboard_data_to_cloud_storage(bucket: str):
    """Exports data needed by the dashboard to the cloud storage bucket.

    Args:
        bucket: The cloud storage location where the exported data should go.
    """
    view_manager.create_dataset_and_update_views(view_config.VIEWS_TO_UPDATE)

    bq_client = BigQueryClientImpl()
    views_to_materialize = dashboard_export_config.VIEWS_TO_MATERIALIZE_FOR_DASHBOARD_EXPORT
    views_to_export = dashboard_export_config.VIEWS_TO_EXPORT

    # This step has to happen before the export because some views in views_to_export depend on the materialized
    # version of a view
    for view in views_to_materialize:
        bq_client.materialize_view_to_table(view)

    bq_client.export_query_results_to_cloud_storage([
        ExportQueryConfig.from_view_query(
            view=view,
            view_filter_clause=f" WHERE state_code = '{state}'",
            intermediate_table_name=f"{view.view_id}_table_{state}",
            output_uri=f"gs://{bucket}/{state}/{view.view_id}.json",
            output_format=bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON)
        for state in dashboard_export_config.STATES_TO_EXPORT
        for view in views_to_export
    ])
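
The examples construct ExportQueryConfig in two ways: Examples #1, #2, and #4 use the from_view_query factory, while Examples #3 and #5 call the constructor directly. Below is a minimal sketch, not the library's implementation, of how such a factory could expand into the full config; the field names come from the constructor calls in Examples #3 and #5, while the query shape and the view attributes used here are assumptions.

from dataclasses import dataclass
from typing import Any, List


@dataclass(frozen=True)
class ExportQueryConfigSketch:
    # Field names mirror the ExportQueryConfig constructor calls in Examples #3 and #5.
    query: str
    query_parameters: List[Any]
    intermediate_dataset_id: str
    intermediate_table_name: str
    output_uri: str
    output_format: str

    @classmethod
    def from_view_query(cls, view: Any, view_filter_clause: str,
                        intermediate_table_name: str, output_uri: str,
                        output_format: str) -> 'ExportQueryConfigSketch':
        # Assumed query shape: select everything from the view and append the
        # caller-supplied filter clause (e.g. " WHERE state_code = 'US_XX'").
        query = f'SELECT * FROM `{view.dataset_id}.{view.view_id}`{view_filter_clause}'
        return cls(query=query,
                   query_parameters=[],
                   # Example #2 asserts the intermediate table is deleted from the
                   # view's dataset, so the view's dataset_id is used here.
                   intermediate_dataset_id=view.dataset_id,
                   intermediate_table_name=intermediate_table_name,
                   output_uri=output_uri,
                   output_format=output_format)
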
Example #2
    def test_export_query_results_to_cloud_storage(self):
        """export_query_results_to_cloud_storage creates the table from the view query and
        exports the table."""
        bucket = self.mock_project_id + '-bucket'
        query_job = futures.Future()
        query_job.set_result([])
        extract_job = futures.Future()
        extract_job.set_result(None)
        self.mock_client.query.return_value = query_job
        self.mock_client.extract_table.return_value = extract_job
        self.bq_client.export_query_results_to_cloud_storage([
            ExportQueryConfig.from_view_query(
                view=self.mock_view,
                view_filter_clause='WHERE x = y',
                intermediate_table_name=self.mock_table_id,
                output_uri=f'gs://{bucket}/view.json',
                output_format=bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON
            )
        ])
        self.mock_client.query.assert_called()
        self.mock_client.extract_table.assert_called()
        self.mock_client.delete_table.assert_called_with(
            bigquery.DatasetReference(
                self.mock_project_id,
                self.mock_view.dataset_id).table(self.mock_table_id))
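
The mocks referenced in this test (self.mock_client, self.bq_client, self.mock_view, self.mock_project_id, self.mock_table_id) come from the test class's setUp, which is not shown in the example. A minimal sketch of the kind of fixture these tests appear to assume, with hypothetical names, values, and patch target:

import unittest
from unittest import mock

# BigQueryClientImpl is the class under test; its import is not shown in these examples.


class BigQueryClientImplExportTest(unittest.TestCase):
    """Hypothetical test fixture; the patch target and attribute values are assumptions."""

    def setUp(self) -> None:
        self.mock_project_id = 'fake-project'
        self.mock_dataset_id = 'fake_dataset'
        self.mock_table_id = 'fake_table_temp'

        # Stand-in view object exposing the attributes the tests read.
        self.mock_view = mock.MagicMock(
            dataset_id=self.mock_dataset_id,
            view_id='fake_view',
            view_query='SELECT NULL LIMIT 0')

        # Patch the underlying google.cloud.bigquery.Client so no real API calls are made.
        self.client_patcher = mock.patch('google.cloud.bigquery.Client')
        self.mock_client = self.client_patcher.start().return_value
        self.addCleanup(self.client_patcher.stop)

        self.bq_client = BigQueryClientImpl()  # constructed after the patch is active
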
Example #3
def run_export(dry_run: bool, state_code: str, target_bucket: str) -> None:
    """Performs the export operation, exporting rows for the given state codes from the tables from the state dataset
    in the given project to CSV files with the same names as the tables to the given GCS bucket."""
    today = datetime.date.today()

    big_query_client = BigQueryClientImpl()
    dataset_ref = big_query_client.dataset_ref_for_id(STATE_BASE_DATASET)
    if not big_query_client.dataset_exists(dataset_ref):
        raise ValueError(f"Dataset {dataset_ref.dataset_id} does not exist")

    tables = big_query_client.list_tables(dataset_ref.dataset_id)

    export_configs = []
    for table in tables:
        logging.info("******************************")
        export_query = state_table_export_query_str(table,
                                                    [state_code.upper()])
        logging.info(export_query)

        if not export_query:
            continue

        export_dir = gcs_export_directory(target_bucket, today,
                                          state_code.lower())
        export_file_name = f"{table.table_id}_{today.isoformat()}_export.csv"
        file = GcsfsFilePath.from_directory_and_file_name(
            export_dir, export_file_name)
        output_uri = file.uri()

        export_config = ExportQueryConfig(
            query=export_query,
            query_parameters=[],
            intermediate_dataset_id="export_temporary_tables",
            intermediate_table_name=(
                f"{dataset_ref.dataset_id}_{table.table_id}_{state_code.lower()}"),
            output_uri=output_uri,
            output_format=bigquery.DestinationFormat.CSV,
        )
        export_configs.append(export_config)
        if dry_run:
            logging.info(
                "[DRY RUN] Created export configuration to export table to GCS: %s",
                export_config,
            )
        else:
            logging.info(
                "Created export configuration to export table to GCS: %s",
                export_config)

    if dry_run:
        logging.info("[DRY RUN] Exporting [%d] tables to GCS",
                     len(export_configs))
    else:
        logging.info("Exporting [%d] tables to GCS", len(export_configs))
        big_query_client.export_query_results_to_cloud_storage(
            export_configs, print_header=True)
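
run_export takes a dry_run flag, a state code, and a target bucket; a hypothetical argparse wrapper showing how it might be invoked from the command line is below (the flag names and defaults are assumptions, not the script's actual interface):

import argparse
import logging


def main() -> None:
    # Hypothetical CLI wrapper for run_export above; flag names are assumptions.
    parser = argparse.ArgumentParser(
        description='Export state dataset tables for one state to a GCS bucket.')
    parser.add_argument('--dry-run', default=True,
                        type=lambda flag: flag.lower() == 'true',
                        help='When true (the default), only logs the export configs.')
    parser.add_argument('--state-code', required=True, help='State to export, e.g. US_XX.')
    parser.add_argument('--target-bucket', required=True, help='GCS bucket to write CSVs to.')
    args = parser.parse_args()

    logging.getLogger().setLevel(logging.INFO)
    run_export(dry_run=args.dry_run,
               state_code=args.state_code,
               target_bucket=args.target_bucket)


if __name__ == '__main__':
    main()
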
Example #4
    def test_export_query_results_to_cloud_storage_no_table(self):
        bucket = self.mock_project_id + '-bucket'
        self.mock_client.get_table.side_effect = exceptions.NotFound('!')
        with self.assertLogs(level='WARNING'):
            self.bq_client.export_query_results_to_cloud_storage([
                ExportQueryConfig.from_view_query(
                    view=self.mock_view,
                    view_filter_clause='WHERE x = y',
                    intermediate_table_name=self.mock_table_id,
                    output_uri=f'gs://{bucket}/view.json',
                    output_format=bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON)
            ])
Example #5
    def export_view_for_args(self, ingest_view_export_args: GcsfsIngestViewExportArgs) -> bool:
        """Performs an export of a single ingest view with date bounds specified in the provided args."""
        if not self.region.are_ingest_view_exports_enabled_in_env():
            raise ValueError(f'Ingest view exports not enabled for region [{self.region.region_code}]')

        metadata = self.file_metadata_manager.get_ingest_view_metadata_for_export_job(ingest_view_export_args)

        if not metadata:
            raise ValueError(f'Found no metadata for the given job args: [{ingest_view_export_args}].')

        if metadata.export_time:
            logging.warning('Already exported view for args [%s] - returning.', ingest_view_export_args)
            return False

        logging.info('Beginning export for view tag [%s] with args: [%s].',
                     ingest_view_export_args.ingest_view_name, ingest_view_export_args)

        query, query_params = self._generate_query_with_params(
            self.ingest_views_by_tag[ingest_view_export_args.ingest_view_name],
            ingest_view_export_args)

        logging.info('Generated export query [%s]', query)
        logging.info('Generated export query params [%s]', query_params)

        output_path = self._generate_output_path(ingest_view_export_args, metadata)

        logging.info('Generated output path [%s]', output_path.uri())

        if not metadata.normalized_file_name:
            self.file_metadata_manager.register_ingest_view_export_file_name(metadata, output_path)

        ingest_view = self.ingest_views_by_tag[ingest_view_export_args.ingest_view_name]
        export_configs = [
            ExportQueryConfig(
                query=query,
                query_parameters=query_params,
                intermediate_dataset_id=ingest_view.dataset_id,
                intermediate_table_name=f'{ingest_view_export_args.ingest_view_name}_latest_export',
                output_uri=output_path.uri(),
                output_format=bigquery.DestinationFormat.CSV
            )
        ]

        logging.info('Starting export to cloud storage.')
        self.big_query_client.export_query_results_to_cloud_storage(export_configs)
        logging.info('Export to cloud storage complete.')

        self.file_metadata_manager.mark_ingest_view_exported(metadata)
        return True
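
Both export_view_for_args variants in this example (the one above and the longer one at the end) read ingest_view_name, upper_bound_datetime_to_export, and upper_bound_datetime_prev off the GcsfsIngestViewExportArgs they receive. A minimal compatible container, inferred only from those usages (the real class may define more fields and helpers):

import datetime
from dataclasses import dataclass
from typing import Optional


@dataclass(frozen=True)
class GcsfsIngestViewExportArgsSketch:
    # Fields inferred from how export_view_for_args uses its args.
    ingest_view_name: str
    upper_bound_datetime_to_export: datetime.datetime
    # When present, the export covers only the delta since this earlier datetime.
    upper_bound_datetime_prev: Optional[datetime.datetime] = None
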
def export_view_data_to_cloud_storage():
    """Exports data in BigQuery views to cloud storage buckets."""
    view_builders_for_views_to_update = view_config.VIEW_BUILDERS_FOR_VIEWS_TO_UPDATE
    view_manager.create_dataset_and_update_views_for_view_builders(view_builders_for_views_to_update)

    bq_client = BigQueryClientImpl()
    project_id = bq_client.project_id

    view_builders_to_materialize = view_config.VIEW_BUILDERS_FOR_VIEWS_TO_MATERIALIZE_FOR_DASHBOARD_EXPORT

    # This step has to happen before the export because some views in views_to_export depend on the materialized
    # version of a view
    for view_builder in view_builders_to_materialize:
        view = view_builder.build()
        bq_client.materialize_view_to_table(view)

    datasets_to_export = view_config.DATASETS_STATES_AND_VIEW_BUILDERS_TO_EXPORT.keys()

    for dataset_id in datasets_to_export:
        states_and_view_builders_to_export = view_config.DATASETS_STATES_AND_VIEW_BUILDERS_TO_EXPORT.get(dataset_id)
        output_uri_template = view_config.OUTPUT_URI_TEMPLATE_FOR_DATASET_EXPORT.get(dataset_id)

        if not states_and_view_builders_to_export or not output_uri_template:
            raise ValueError(f"Trying to export views from an unsupported dataset: {dataset_id}")

        views_to_export = [
            ExportQueryConfig.from_view_query(
                view=view,
                view_filter_clause=f" WHERE state_code = '{state_code}'",
                intermediate_table_name=(
                    f"{view.view_id if view.materialized_view_table_id is None else view.materialized_view_table_id}"
                    f"_table_{state_code}"),
                output_uri=output_uri_template.format(
                    project_id=project_id,
                    state_code=state_code,
                    view_id=view.view_id if view.materialized_view_table_id is None else view.materialized_view_table_id
                ),
                output_format=bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON)
            for state_code, view_builders in states_and_view_builders_to_export.items()
            for view in [view_builder.build() for view_builder in view_builders]
        ]

        bq_client.export_query_results_to_cloud_storage(views_to_export)
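
The comprehension above relies on two mappings from view_config. Judging by how they are accessed, DATASETS_STATES_AND_VIEW_BUILDERS_TO_EXPORT maps a dataset id to a dict of state codes to view builder lists, and OUTPUT_URI_TEMPLATE_FOR_DATASET_EXPORT maps a dataset id to a URI template with project_id, state_code, and view_id placeholders. A hypothetical shape, with placeholder dataset ids and bucket names:

# Hypothetical shapes for the two view_config mappings used above; the dataset ids,
# bucket names, and (empty) view builder lists are placeholders, not the real config.
DATASETS_STATES_AND_VIEW_BUILDERS_TO_EXPORT = {
    'example_dashboard_views': {
        'US_XX': [],  # list of view builders whose .build() yields a BigQuery view
        'US_YY': [],
    },
}

OUTPUT_URI_TEMPLATE_FOR_DATASET_EXPORT = {
    'example_dashboard_views':
        'gs://{project_id}-example-bucket/{state_code}/{view_id}.json',
}
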
    def export_view_for_args(
            self, ingest_view_export_args: GcsfsIngestViewExportArgs) -> bool:
        """Performs an Cloud Storage export of a single ingest view with date bounds specified in the provided args. If
        the provided args contain an upper and lower bound date, the exported view contains only the delta between the
        two dates. If only the upper bound is provided, then the exported view contains historical results up until the
        bound date.

        Note: In order to prevent resource exhaustion in BigQuery, the ultimate query in this method is broken down
        into distinct parts. This method first persists the results of historical queries for each given bound date
        (upper and lower) into temporary tables. The delta between those tables is then queried separately using
        SQL's `EXCEPT DISTINCT` and those final results are exported to Cloud Storage.
        """
        if not self.region.are_ingest_view_exports_enabled_in_env():
            raise ValueError(
                f'Ingest view exports not enabled for region [{self.region.region_code}]'
            )

        metadata = self.file_metadata_manager.get_ingest_view_metadata_for_export_job(
            ingest_view_export_args)

        if not metadata:
            raise ValueError(
                f'Found no metadata for the given job args: [{ingest_view_export_args}].'
            )

        if metadata.export_time:
            logging.warning('Already exported view for args [%s] - returning.',
                            ingest_view_export_args)
            return False

        output_path = self._generate_output_path(ingest_view_export_args,
                                                 metadata)
        logging.info('Generated output path [%s]', output_path.uri())

        if not metadata.normalized_file_name:
            self.file_metadata_manager.register_ingest_view_export_file_name(
                metadata, output_path)

        ingest_view = self.ingest_views_by_tag[
            ingest_view_export_args.ingest_view_name]
        single_date_table_ids = []
        single_date_table_export_jobs = []

        upper_bound_table_name = (
            f'{ingest_view_export_args.ingest_view_name}_'
            f'{ingest_view_export_args.upper_bound_datetime_to_export.strftime(TABLE_NAME_DATE_FORMAT)}_'
            f'upper_bound')
        export_job = self._generate_export_job_for_date(
            table_name=upper_bound_table_name,
            ingest_view=ingest_view,
            date_bound=ingest_view_export_args.upper_bound_datetime_to_export)
        single_date_table_ids.append(upper_bound_table_name)
        single_date_table_export_jobs.append(export_job)

        query = SELECT_SUBQUERY.format(
            project_id=self.big_query_client.project_id,
            dataset_id=ingest_view.dataset_id,
            table_name=upper_bound_table_name)

        if ingest_view_export_args.upper_bound_datetime_prev:
            lower_bound_table_name = (
                f'{ingest_view_export_args.ingest_view_name}_'
                f'{ingest_view_export_args.upper_bound_datetime_prev.strftime(TABLE_NAME_DATE_FORMAT)}_'
                f'lower_bound')
            export_job = self._generate_export_job_for_date(
                table_name=lower_bound_table_name,
                ingest_view=ingest_view,
                date_bound=ingest_view_export_args.upper_bound_datetime_prev)
            single_date_table_export_jobs.append(export_job)
            single_date_table_ids.append(lower_bound_table_name)

            filter_query = SELECT_SUBQUERY.format(
                project_id=self.big_query_client.project_id,
                dataset_id=ingest_view.dataset_id,
                table_name=lower_bound_table_name).rstrip().rstrip(';')
            query = query.rstrip().rstrip(';')
            query = f'(\n{query}\n) EXCEPT DISTINCT (\n{filter_query}\n);'

        query = DirectIngestPreProcessedIngestView.add_order_by_suffix(
            query=query, order_by_cols=ingest_view.order_by_cols)

        # Wait for completion of all async date queries
        for query_job in single_date_table_export_jobs:
            query_job.result()
        logging.info(
            'Completed loading results of individual date queries into intermediate tables.'
        )

        logging.info('Generated final export query [%s]', str(query))

        export_configs = [
            ExportQueryConfig(
                query=query,
                query_parameters=[],
                intermediate_dataset_id=ingest_view.dataset_id,
                intermediate_table_name=(
                    f'{ingest_view_export_args.ingest_view_name}_latest_export'),
                output_uri=output_path.uri(),
                output_format=bigquery.DestinationFormat.CSV)
        ]

        logging.info('Starting export to cloud storage.')
        self.big_query_client.export_query_results_to_cloud_storage(
            export_configs=export_configs)
        logging.info('Export to cloud storage complete.')

        for table_id in single_date_table_ids:
            self.big_query_client.delete_table(
                dataset_id=ingest_view.dataset_id, table_id=table_id)
            logging.info('Deleted intermediate table [%s]', table_id)

        self.file_metadata_manager.mark_ingest_view_exported(metadata)

        return True
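
The docstring of this second export_view_for_args describes materializing each bound date's results into an intermediate table and then taking the delta with EXCEPT DISTINCT. Below is a standalone sketch of how the final query string is assembled under that scheme; the SELECT_SUBQUERY text, table names, and ORDER BY column are placeholders, not the codebase's actual templates.

# Placeholder subquery template; the real SELECT_SUBQUERY lives in the codebase.
SELECT_SUBQUERY = 'SELECT * FROM `{project_id}.{dataset_id}.{table_name}`'

upper = SELECT_SUBQUERY.format(project_id='my-project', dataset_id='my_dataset',
                               table_name='my_view_20200101_upper_bound')
lower = SELECT_SUBQUERY.format(project_id='my-project', dataset_id='my_dataset',
                               table_name='my_view_20191201_lower_bound')

# Rows present at the upper-bound snapshot but not at the lower-bound snapshot, i.e. the delta.
query = f'(\n{upper}\n) EXCEPT DISTINCT (\n{lower}\n)'

# The real code appends the view's order_by_cols via add_order_by_suffix; a placeholder column is used here.
query = f'{query}\nORDER BY person_id;'
print(query)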