def _get_columns_by_file(
        state_code: str,
        project_id: str) -> Dict[str, List[RawTableColumnInfo]]:
    """Creates a list of RawTableColumnInfo for each raw file in a given state.

    Queries the BigQuery INFORMATION_SCHEMA of the state's raw data dataset
    and groups the returned columns by table (raw file) name, skipping the
    infrastructure-managed metadata columns.

    Args:
        state_code: State identifier; lowercased to derive the
            `<state_code>_raw_data` dataset name.
        project_id: The GCP project containing the dataset.

    Returns:
        Mapping of raw file (table) name to its columns, ordered by each
        column's ordinal position within the table.
    """
    columns_by_file: Dict[str, List[RawTableColumnInfo]] = {}

    raw_data_dataset = f"{state_code.lower()}_raw_data"

    query_string = f"""
SELECT
 * EXCEPT(is_generated, generation_expression, is_stored, is_updatable)
FROM
 `{project_id}.{raw_data_dataset}.INFORMATION_SCHEMA.COLUMNS`
ORDER BY
  table_name ASC, ordinal_position ASC
"""

    bq_client = BigQueryClientImpl()
    query_job = bq_client.run_query_async(query_string)

    # Columns added by the ingest infrastructure, not part of the raw file.
    metadata_columns = {"file_id", "update_datetime"}
    for row in query_job:
        column_name = row["column_name"]
        if column_name in metadata_columns:
            continue

        is_datetime = row["data_type"].upper() == "DATETIME"

        # "TKTK" is a placeholder description meant to be filled in manually.
        column_info = RawTableColumnInfo(name=column_name,
                                         is_datetime=is_datetime,
                                         description="TKTK")
        columns_by_file.setdefault(row["table_name"], []).append(column_info)

    return columns_by_file


def compare_raw_data_between_projects(
    region_code: str,
    source_project_id: str = environment.GCP_PROJECT_STAGING,
    comparison_project_id: str = environment.GCP_PROJECT_PRODUCTION,
) -> List[str]:
    """Compares the raw data between staging and production for a given region.

    For each documented raw file with primary keys that exists in the source
    project, runs a comparison query and logs which tables in the comparison
    project are missing entirely or are missing rows for particular update
    datetimes.

    Args:
        region_code: The region whose raw data should be compared.
        source_project_id: Project treated as the source of truth.
        comparison_project_id: Project checked for missing data.

    Returns:
        The file tags whose data is partially or entirely missing from the
        comparison project.
    """
    logging.info(
        "**** Ensuring all raw data for [%s] in [%s] also exists in [%s] ****",
        region_code.upper(),
        source_project_id,
        comparison_project_id,
    )

    raw_file_config = DirectIngestRegionRawFileConfig(region_code)

    bq_client = BigQueryClientImpl(project_id=source_project_id)
    dataset_id = DirectIngestRawFileImportManager.raw_tables_dataset_for_region(
        region_code)
    source_dataset = bq_client.dataset_ref_for_id(dataset_id)

    # Kick off all comparison queries asynchronously before collecting any
    # results so the jobs run concurrently.
    query_jobs: Dict[str, bigquery.QueryJob] = {}
    for file_tag, file_config in raw_file_config.raw_file_configs.items():
        # Skip files that cannot be meaningfully compared: absent from the
        # source project, undocumented, or lacking primary keys.
        if (not bq_client.table_exists(source_dataset, file_tag)
                or file_config.is_undocumented
                or not file_config.primary_key_cols):
            continue

        columns = ", ".join(
            [column.name for column in file_config.available_columns])

        query_job = bq_client.run_query_async(
            query_str=COMPARISON_TEMPLATE.format(
                source_project_id=source_project_id,
                comparison_project_id=comparison_project_id,
                raw_data_dataset_id=dataset_id,
                raw_data_table_id=file_tag,
                columns=columns,
            ))
        query_jobs[file_tag] = query_job

    # Width used to left-justify file tags in log output, capped at 30.
    # `default=0` guards against a region with no configured raw files,
    # where a bare max() would raise ValueError.
    table_column_width = min(
        max((len(tag) for tag in raw_file_config.raw_file_configs), default=0),
        30)

    failed_tables: List[str] = []
    for file_tag in sorted(raw_file_config.raw_file_tags):
        justified_name = file_tag.ljust(table_column_width)

        if file_tag not in query_jobs:
            # This file did not exist in the project that is the source of truth.
            continue

        query_job = query_jobs[file_tag]
        try:
            rows = query_job.result()
        except exceptions.NotFound:
            # The table does not exist at all in the comparison project.
            logging.warning(
                "%s | Missing table %s.%s.%s",
                justified_name,
                comparison_project_id,
                dataset_id,
                file_tag,
            )
            failed_tables.append(file_tag)
            continue

        # Each result row holds (update_datetime, number of missing rows).
        counts: List[Tuple[datetime.datetime,
                           int]] = [row.values() for row in rows]

        if counts:
            logging.warning(
                "%s | Missing data in the %s table",
                justified_name,
                comparison_project_id,
            )
            for update_datetime, num_missing in counts:
                logging.warning("\t%ss: %d", update_datetime.isoformat(),
                                num_missing)
            failed_tables.append(file_tag)
        else:
            logging.info(
                "%s | %s contains all of the data from %s",
                justified_name,
                comparison_project_id,
                source_project_id,
            )

    return failed_tables