Example #1

def main(*, source_project_id: str, source_dataset_id: str,
         destination_project_id: str, destination_dataset_id: str) -> None:
    """Copies all views from the source_project_id.source_dataset_id to the
    destination_project_id.destination_dataset_id."""

    # Construct a BigQuery client with the source_project_id
    source_client = BigQueryClientImpl(project_id=source_project_id)

    # Construct a BigQuery client with the destination_project_id
    destination_client = BigQueryClientImpl(project_id=destination_project_id)

    destination_dataset = bigquery.DatasetReference(destination_project_id,
                                                    destination_dataset_id)

    tables_in_source_dataset = source_client.list_tables(source_dataset_id)

    for table_ref in tables_in_source_dataset:
        table = source_client.get_table(
            source_client.dataset_ref_for_id(table_ref.dataset_id),
            table_ref.table_id)

        # Only copy this view if there is a view_query to replicate and the view doesn't already exist in the
        # destination dataset
        if table.view_query and not destination_client.table_exists(
                destination_dataset, table_id=table.table_id):
            # Copy the view to the destination dataset
            source_client.copy_view(
                view=BigQueryView(dataset_id=table_ref.dataset_id,
                                  view_id=table.table_id,
                                  view_query_template=table.view_query),
                destination_client=destination_client,
                destination_dataset_ref=destination_dataset)
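
A minimal invocation sketch; the project and dataset IDs below are hypothetical placeholders, not values from the original source:

# Hypothetical usage: copy every view in one dataset to another project.
main(
    source_project_id="my-source-project",       # hypothetical project ID
    source_dataset_id="reference_views",         # hypothetical dataset ID
    destination_project_id="my-dest-project",    # hypothetical project ID
    destination_dataset_id="reference_views",    # hypothetical dataset ID
)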

Example #2

def load_from_temp_to_permanent_table(bq_client: BigQueryClientImpl,
                                      project_id: str) -> None:
    """Query temporary table and persist view to permanent table"""
    num_rows_before = bq_client.get_table(
        dataset_ref=bigquery.DatasetReference(
            project=project_id,
            dataset_id=DATASET_ID,
        ),
        table_id=FINAL_DESTINATION_TABLE,
    ).num_rows

    insert_job = bq_client.insert_into_table_from_query(
        destination_dataset_id=DATASET_ID,
        destination_table_id=FINAL_DESTINATION_TABLE,
        query=INSERT_QUERY_TEMPLATE.format(
            project_id=project_id,
            dataset_id=DATASET_ID,
            temp_table=TEMP_DESTINATION_TABLE,
            final_table=FINAL_DESTINATION_TABLE,
        ),
        write_disposition=WriteDisposition.WRITE_APPEND,
    )

    insert_job_result = insert_job.result()

    logging.info(
        "Loaded [%d] non-duplicate rows into table [%s]",
        (insert_job_result.total_rows - num_rows_before),
        FINAL_DESTINATION_TABLE,
    )

    bq_client.delete_table(dataset_id=DATASET_ID,
                           table_id=TEMP_DESTINATION_TABLE)
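
A usage sketch, assuming DATASET_ID, TEMP_DESTINATION_TABLE, FINAL_DESTINATION_TABLE, and INSERT_QUERY_TEMPLATE are module-level constants defined elsewhere in the same module; the project ID is a hypothetical placeholder:

# Hypothetical usage: append deduplicated rows from the temp table to the
# permanent table, then drop the temp table.
bq_client = BigQueryClientImpl(project_id="my-project")  # hypothetical project
load_from_temp_to_permanent_table(bq_client, project_id="my-project")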

Example #3

    def columns(self) -> List[str]:
        """Returns the column names for this table, fetching the schema from
        BigQuery once and caching the result."""
        if self._columns is None:
            bq_client = BigQueryClientImpl()
            t = bq_client.get_table(
                bq_client.dataset_ref_for_id(self.dataset_id), self.table_id)
            self._columns = [col.name for col in t.schema]

        return self._columns
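
This method belongs to a table wrapper class not shown in the excerpt; a minimal usage sketch, assuming a hypothetical wrapper that exposes dataset_id and table_id attributes and initializes self._columns to None:

# Hypothetical usage -- SomeTableWrapper stands in for the real class.
table = SomeTableWrapper(dataset_id="my_dataset", table_id="my_table")
print(table.columns())  # hits BigQuery once; later calls use the cached list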

Example #4

def copy_bq_views(
    source_project_id: str,
    source_dataset_id: str,
    destination_project_id: str,
    destination_dataset_id: str,
) -> None:
    """Copies all views from the source_project_id.source_dataset_id to the
    destination_project_id.destination_dataset_id."""

    # Construct a BigQuery client with the source_project_id
    source_client = BigQueryClientImpl(project_id=source_project_id)

    # Construct a BigQuery client with the destination_project_id
    destination_client = BigQueryClientImpl(project_id=destination_project_id)
    destination_dataset = bigquery.DatasetReference(destination_project_id,
                                                    destination_dataset_id)
    tables_in_source_dataset = source_client.list_tables(source_dataset_id)

    for table_ref in tables_in_source_dataset:
        table = source_client.get_table(
            source_client.dataset_ref_for_id(table_ref.dataset_id),
            table_ref.table_id)
        view_query = table.view_query

        # Only copy this view if there is a view_query to replicate and the view doesn't already exist in the
        # destination dataset
        if view_query and not destination_client.table_exists(
                destination_dataset, table_id=table.table_id):
            # Remove any references to the source_project_id from the view_query
            updated_view_query = view_query.replace(source_project_id,
                                                    "{project_id}")

            # Copy the view to the destination dataset
            source_client.copy_view(
                view=BigQueryView(
                    project_id=destination_project_id,
                    dataset_id=destination_dataset_id,
                    view_id=table.table_id,
                    description=table.description,
                    view_query_template=updated_view_query,
                ),
                destination_client=destination_client,
                destination_dataset_ref=destination_dataset,
            )
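
As above, a minimal invocation sketch with hypothetical placeholder IDs:

# Hypothetical usage: replicate views, templating out the source project ID.
copy_bq_views(
    source_project_id="my-source-project",
    source_dataset_id="reference_views",
    destination_project_id="my-dest-project",
    destination_dataset_id="reference_views",
)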

Example #5

def compare_dataflow_output_to_sandbox(
    sandbox_dataset_prefix: str,
    job_name_to_compare: str,
    base_output_job_id: str,
    sandbox_output_job_id: str,
    additional_columns_to_compare: List[str],
    allow_overwrite: bool = False,
) -> None:
    """Compares the output for all metrics produced by the daily pipeline job with the given |job_name_to_compare|
    between the output from the |base_output_job_id| job in the dataflow_metrics dataset and the output from the
    |sandbox_output_job_id| job in the sandbox dataflow dataset."""
    bq_client = BigQueryClientImpl()
    sandbox_dataflow_dataset_id = (sandbox_dataset_prefix + "_" +
                                   DATAFLOW_METRICS_DATASET)

    sandbox_comparison_output_dataset_id = (sandbox_dataset_prefix +
                                            "_dataflow_comparison_output")
    sandbox_comparison_output_dataset_ref = bq_client.dataset_ref_for_id(
        sandbox_comparison_output_dataset_id)

    if bq_client.dataset_exists(sandbox_comparison_output_dataset_ref) and any(
            bq_client.list_tables(sandbox_comparison_output_dataset_id)):
        if not allow_overwrite:
            if __name__ == "__main__":
                logging.error(
                    "Dataset %s already exists in project %s. To overwrite, set --allow_overwrite.",
                    sandbox_comparison_output_dataset_id,
                    bq_client.project_id,
                )
                sys.exit(1)
            else:
                raise ValueError(
                    f"Cannot write comparison output to a non-empty dataset. Please delete tables in dataset: "
                    f"{bq_client.project_id}.{sandbox_comparison_output_dataset_id}."
                )
        else:
            # Clean up the existing tables in the dataset
            for table in bq_client.list_tables(
                    sandbox_comparison_output_dataset_id):
                bq_client.delete_table(table.dataset_id, table.table_id)

    bq_client.create_dataset_if_necessary(
        sandbox_comparison_output_dataset_ref,
        TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS)

    query_jobs: List[Tuple[QueryJob, str]] = []

    pipelines = YAMLDict.from_path(PRODUCTION_TEMPLATES_PATH).pop_dicts(
        "daily_pipelines")

    for pipeline in pipelines:
        if pipeline.pop("job_name", str) == job_name_to_compare:
            pipeline_metric_types = pipeline.peek_optional("metric_types", str)

            if not pipeline_metric_types:
                raise ValueError(
                    f"Pipeline job {job_name_to_compare} missing required metric_types attribute."
                )

            metric_types_for_comparison = pipeline_metric_types.split()

            for metric_class, metric_table in DATAFLOW_METRICS_TO_TABLES.items(
            ):
                metric_type_value = DATAFLOW_TABLES_TO_METRIC_TYPES[
                    metric_table].value

                if metric_type_value in metric_types_for_comparison:
                    comparison_query = _query_for_metric_comparison(
                        bq_client,
                        base_output_job_id,
                        sandbox_output_job_id,
                        sandbox_dataflow_dataset_id,
                        metric_class,
                        metric_table,
                        additional_columns_to_compare,
                    )

                    query_job = bq_client.create_table_from_query_async(
                        dataset_id=sandbox_comparison_output_dataset_id,
                        table_id=metric_table,
                        query=comparison_query,
                        overwrite=True,
                    )

                    # Add query job to the list of running jobs
                    query_jobs.append((query_job, metric_table))

    for query_job, output_table_id in query_jobs:
        # Wait for the insert job to complete before looking for the table
        query_job.result()

        output_table = bq_client.get_table(
            sandbox_comparison_output_dataset_ref, output_table_id)

        if output_table.num_rows == 0:
            # If there are no rows in the output table, then the output was identical
            bq_client.delete_table(sandbox_comparison_output_dataset_id,
                                   output_table_id)

    metrics_with_different_output = peekable(
        bq_client.list_tables(sandbox_comparison_output_dataset_id))

    logging.info(
        "\n*************** DATAFLOW OUTPUT COMPARISON RESULTS ***************\n"
    )

    if metrics_with_different_output:
        for metric_table in metrics_with_different_output:
            # This will always be true, and is here to silence mypy warnings
            assert isinstance(metric_table, bigquery.table.TableListItem)

            logging.warning(
                "Dataflow output differs for metric %s. See %s.%s for diverging rows.",
                metric_table.table_id,
                sandbox_comparison_output_dataset_id,
                metric_table.table_id,
            )
    else:
        logging.info(
            "Dataflow output identical. Deleting dataset %s.",
            sandbox_comparison_output_dataset_ref.dataset_id,
        )
        bq_client.delete_dataset(sandbox_comparison_output_dataset_ref,
                                 delete_contents=True)
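
A usage sketch; the job name and Dataflow job IDs below are hypothetical placeholders, not real pipeline identifiers:

# Hypothetical usage: diff a sandbox pipeline run against the deployed run.
compare_dataflow_output_to_sandbox(
    sandbox_dataset_prefix="my_prefix",
    job_name_to_compare="incarceration-calculations",   # hypothetical job name
    base_output_job_id="2021-01-01_00_00_00-111",       # hypothetical job IDs
    sandbox_output_job_id="2021-01-01_00_00_00-222",
    additional_columns_to_compare=["person_id"],
    allow_overwrite=False,
)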

Example #6

def _view_output_comparison_job(
    bq_client: BigQueryClientImpl,
    view_builder: MetricBigQueryViewBuilder,
    base_view_id: str,
    base_dataset_id: str,
    sandbox_dataset_id: str,
    sandbox_comparison_output_dataset_id: str,
    check_determinism: bool,
    allow_schema_changes: bool,
) -> Tuple[bigquery.QueryJob, str]:
    """Builds and executes the query that compares the base and sandbox views. Returns a tuple with the the QueryJob and
    the table_id where the output will be written to in the sandbox_comparison_output_dataset_id dataset."""
    base_dataset_ref = bq_client.dataset_ref_for_id(base_dataset_id)
    sandbox_dataset_ref = bq_client.dataset_ref_for_id(sandbox_dataset_id)
    output_table_id = f"{view_builder.dataset_id}--{base_view_id}"

    if check_determinism:
        # Compare all columns
        columns_to_compare = ["*"]
        preserve_column_types = True
    else:
        # Columns in deployed view
        deployed_base_view = bq_client.get_table(base_dataset_ref,
                                                 view_builder.view_id)
        # If there are nested columns in the deployed view then we can't allow column type changes
        preserve_column_types = _table_contains_nested_columns(
            deployed_base_view)
        base_columns_to_compare = set(field.name
                                      for field in deployed_base_view.schema)

        # Columns in sandbox view
        deployed_sandbox_view = bq_client.get_table(sandbox_dataset_ref,
                                                    view_builder.view_id)
        if not preserve_column_types:
            # If there are nested columns in the sandbox view then we can't allow column type changes
            preserve_column_types = _table_contains_nested_columns(
                deployed_sandbox_view)

        sandbox_columns_to_compare = set(
            field.name for field in deployed_sandbox_view.schema)

        if allow_schema_changes:
            # Only compare columns in both views
            shared_columns = base_columns_to_compare.intersection(
                sandbox_columns_to_compare)
            columns_to_compare = list(shared_columns)
        else:
            if base_columns_to_compare != sandbox_columns_to_compare:
                raise ValueError(
                    f"Schemas of the {base_dataset_id}.{base_view_id} deployed and"
                    f" sandbox views do not match. If this is expected, please run again"
                    f"with the --allow_schema_changes flag.")
            columns_to_compare = list(base_columns_to_compare)

    # Only include dimensions that are present in both views, unless we are
    # checking the determinism of the local view
    metric_dimensions = [
        dimension for dimension in view_builder.dimensions
        if dimension in columns_to_compare or check_determinism
    ]

    if not preserve_column_types:
        # Cast all columns to strings to guard against column types that may have changed
        columns_to_compare = [
            f"CAST({col} AS STRING) as {col}" for col in columns_to_compare
        ]

    base_dataset_id_for_query = (sandbox_dataset_id if check_determinism else
                                 view_builder.dataset_id)

    diff_query = OUTPUT_COMPARISON_TEMPLATE.format(
        project_id=bq_client.project_id,
        base_dataset_id=base_dataset_id_for_query,
        sandbox_dataset_id=sandbox_dataset_id,
        view_id=base_view_id,
        columns_to_compare=", ".join(columns_to_compare),
        dimensions=", ".join(metric_dimensions),
    )

    return (
        bq_client.create_table_from_query_async(
            dataset_id=sandbox_comparison_output_dataset_id,
            table_id=output_table_id,
            query=diff_query,
            overwrite=True,
        ),
        output_table_id,
    )
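
The OUTPUT_COMPARISON_TEMPLATE used above is not included in these excerpts. One plausible shape for it, assuming a symmetric EXCEPT DISTINCT diff that labels which side each diverging row came from, might be:

# Hypothetical sketch of the query template -- not the original definition.
OUTPUT_COMPARISON_TEMPLATE = """
SELECT {dimensions}, 'base_only' AS source FROM (
    SELECT {columns_to_compare}
    FROM `{project_id}.{base_dataset_id}.{view_id}`
    EXCEPT DISTINCT
    SELECT {columns_to_compare}
    FROM `{project_id}.{sandbox_dataset_id}.{view_id}`
)
UNION ALL
SELECT {dimensions}, 'sandbox_only' AS source FROM (
    SELECT {columns_to_compare}
    FROM `{project_id}.{sandbox_dataset_id}.{view_id}`
    EXCEPT DISTINCT
    SELECT {columns_to_compare}
    FROM `{project_id}.{base_dataset_id}.{view_id}`
)
"""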

Example #7

def compare_metric_view_output_to_sandbox(
    sandbox_dataset_prefix: str,
    load_sandbox_views: bool,
    check_determinism: bool,
    allow_schema_changes: bool,
    dataset_id_filters: Optional[List[str]],
) -> None:
    """Compares the output of all deployed metric views to the output of the corresponding views in the sandbox
    dataset."""
    if load_sandbox_views:
        logging.info(
            "Loading views into sandbox datasets prefixed with %s",
            sandbox_dataset_prefix,
        )
        load_views_to_sandbox(sandbox_dataset_prefix)

    bq_client = BigQueryClientImpl()
    sandbox_comparison_output_dataset_id = (sandbox_dataset_prefix +
                                            "_metric_view_comparison_output")
    sandbox_comparison_output_dataset_ref = bq_client.dataset_ref_for_id(
        sandbox_comparison_output_dataset_id)

    if bq_client.dataset_exists(sandbox_comparison_output_dataset_ref) and any(
            bq_client.list_tables(sandbox_comparison_output_dataset_id)):
        raise ValueError(
            f"Cannot write comparison output to a non-empty dataset. Please delete tables in dataset: "
            f"{bq_client.project_id}.{sandbox_comparison_output_dataset_id}.")

    bq_client.create_dataset_if_necessary(
        sandbox_comparison_output_dataset_ref,
        TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS)

    query_jobs: List[Tuple[QueryJob, str]] = []
    skipped_views: List[str] = []

    for view_builders in VIEW_BUILDERS_BY_NAMESPACE.values():
        for view_builder in view_builders:
            # Only compare output of metric views
            if not isinstance(view_builder, MetricBigQueryViewBuilder):
                continue

            base_dataset_id = view_builder.dataset_id

            if dataset_id_filters and base_dataset_id not in dataset_id_filters:
                continue

            if view_builder in VIEW_BUILDERS_WITH_KNOWN_NOT_DETERMINISTIC_OUTPUT:
                logging.warning(
                    "View %s.%s has known non-deterministic output. Skipping output comparison.",
                    view_builder.dataset_id,
                    view_builder.view_id,
                )
                skipped_views.append(
                    f"{view_builder.dataset_id}.{view_builder.view_id}")
                continue

            sandbox_dataset_id = sandbox_dataset_prefix + "_" + base_dataset_id

            if not bq_client.dataset_exists(
                    bq_client.dataset_ref_for_id(sandbox_dataset_id)):
                raise ValueError(
                    f"Trying to compare output to a sandbox dataset that does not exist: "
                    f"{bq_client.project_id}.{sandbox_dataset_id}")

            base_dataset_ref = bq_client.dataset_ref_for_id(base_dataset_id)
            base_view_id = (view_builder.build().materialized_view_table_id
                            if view_builder.should_materialize
                            and not check_determinism else
                            view_builder.view_id)

            if not base_view_id:
                raise ValueError(
                    "Unexpected empty base_view_id. view_id or materialized_view_table_id unset"
                    f"for {view_builder}.")

            if not check_determinism and not bq_client.table_exists(
                    base_dataset_ref, base_view_id):
                logging.warning(
                    "View %s.%s does not exist. Skipping output comparison.",
                    base_dataset_ref.dataset_id,
                    base_view_id,
                )
                skipped_views.append(f"{base_dataset_id}.{base_view_id}")
                continue

            if not bq_client.table_exists(
                    bq_client.dataset_ref_for_id(sandbox_dataset_id),
                    base_view_id):
                logging.warning(
                    "View %s.%s does not exist in sandbox. Skipping output comparison.",
                    sandbox_dataset_id,
                    base_view_id,
                )
                skipped_views.append(f"{sandbox_dataset_id}.{base_view_id}")
                continue
            query_job, output_table_id = _view_output_comparison_job(
                bq_client,
                view_builder,
                base_view_id,
                base_dataset_id,
                sandbox_dataset_id,
                sandbox_comparison_output_dataset_id,
                check_determinism,
                allow_schema_changes,
            )

            # Add query job to the list of running jobs
            query_jobs.append((query_job, output_table_id))

    for query_job, output_table_id in query_jobs:
        # Wait for the insert job to complete before looking for the table
        query_job.result()

        output_table = bq_client.get_table(
            sandbox_comparison_output_dataset_ref, output_table_id)

        if output_table.num_rows == 0:
            # If there are no rows in the output table, then the view output was identical
            bq_client.delete_table(sandbox_comparison_output_dataset_id,
                                   output_table_id)

    views_with_different_output = peekable(
        bq_client.list_tables(sandbox_comparison_output_dataset_id))

    logging.info(
        "\n*************** METRIC VIEW OUTPUT RESULTS ***************\n")

    if dataset_id_filters:
        logging.info(
            "Only compared metric view output for the following datasets: \n %s \n",
            dataset_id_filters,
        )

    logging.info(
        "Skipped output comparison for the following metric views: \n %s \n",
        skipped_views,
    )

    if views_with_different_output:
        for view in views_with_different_output:
            base_dataset_id, base_view_id = view.table_id.split("--")

            logging.warning(
                "View output differs for view %s.%s. See %s.%s for diverging rows.",
                base_dataset_id,
                base_view_id,
                sandbox_comparison_output_dataset_id,
                view.table_id,
            )
    else:
        output_message = (
            "identical between deployed views and sandbox datasets"
            if not check_determinism else "deterministic")
        logging.info(
            "View output %s. Deleting dataset %s.",
            output_message,
            sandbox_comparison_output_dataset_ref.dataset_id,
        )
        bq_client.delete_dataset(sandbox_comparison_output_dataset_ref,
                                 delete_contents=True)
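
A closing usage sketch; the prefix and dataset filter below are hypothetical placeholders:

# Hypothetical usage: load sandbox views, then compare their output to the
# deployed views for a single dataset.
compare_metric_view_output_to_sandbox(
    sandbox_dataset_prefix="my_prefix",
    load_sandbox_views=True,
    check_determinism=False,
    allow_schema_changes=False,
    dataset_id_filters=["dashboard_views"],  # hypothetical dataset filter
)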