def _get_columns_by_file(
    state_code: str, project_id: str
) -> Dict[str, List[RawTableColumnInfo]]:
    """Creates a list of RawTableColumnInfo for each raw file in a given state.

    Queries the state's `<state_code>_raw_data` dataset via INFORMATION_SCHEMA
    and groups the discovered columns by table name. The 'file_id' and
    'update_datetime' columns are skipped (presumably import-time metadata
    rather than raw source columns — confirm against the ingest pipeline).
    Every description is filled with the "TKTK" placeholder for a human to
    replace later.
    """
    dataset_id = f"{state_code.lower()}_raw_data"
    # EXCEPT(...) drops generated-column metadata fields we never read.
    info_schema_query = f"""
SELECT * EXCEPT(is_generated, generation_expression, is_stored, is_updatable)
FROM `{project_id}.{dataset_id}.INFORMATION_SCHEMA.COLUMNS`
ORDER BY table_name ASC, ordinal_position ASC
"""
    client = BigQueryClientImpl()
    result_rows = client.run_query_async(info_schema_query)

    columns_by_file: Dict[str, List[RawTableColumnInfo]] = {}
    for result_row in result_rows:
        col_name = result_row["column_name"]
        if col_name in {"file_id", "update_datetime"}:
            continue
        columns_by_file.setdefault(result_row["table_name"], []).append(
            RawTableColumnInfo(
                name=col_name,
                is_datetime=result_row["data_type"].upper() == "DATETIME",
                description="TKTK",
            )
        )
    return columns_by_file
def compare_raw_data_between_projects(
    region_code: str,
    source_project_id: str = environment.GCP_PROJECT_STAGING,
    comparison_project_id: str = environment.GCP_PROJECT_PRODUCTION,
) -> List[str]:
    """Compares the raw data between staging and production for a given region.

    For each raw file config that has a table in the source project, is
    documented, and has primary keys, kicks off a comparison query and then
    logs whether any source rows are missing from the comparison project.

    Args:
        region_code: The region whose raw data should be compared.
        source_project_id: Project treated as the source of truth.
        comparison_project_id: Project checked for missing data.

    Returns:
        File tags whose tables are missing or incomplete in the comparison
        project.
    """
    logging.info(
        "**** Ensuring all raw data for [%s] in [%s] also exists in [%s] ****",
        region_code.upper(),
        source_project_id,
        comparison_project_id,
    )

    raw_file_config = DirectIngestRegionRawFileConfig(region_code)

    bq_client = BigQueryClientImpl(project_id=source_project_id)
    dataset_id = DirectIngestRawFileImportManager.raw_tables_dataset_for_region(
        region_code
    )
    source_dataset = bq_client.dataset_ref_for_id(dataset_id)

    # Launch all comparison queries asynchronously before collecting results.
    query_jobs: Dict[str, bigquery.QueryJob] = {}
    for file_tag, file_config in raw_file_config.raw_file_configs.items():
        # Skip tables that cannot be compared: absent from the source project,
        # explicitly undocumented, or lacking primary keys to join on.
        if (
            not bq_client.table_exists(source_dataset, file_tag)
            or file_config.is_undocumented
            or not file_config.primary_key_cols
        ):
            continue

        columns = ", ".join(
            column.name for column in file_config.available_columns
        )

        query_job = bq_client.run_query_async(
            query_str=COMPARISON_TEMPLATE.format(
                source_project_id=source_project_id,
                comparison_project_id=comparison_project_id,
                raw_data_dataset_id=dataset_id,
                raw_data_table_id=file_tag,
                columns=columns,
            )
        )
        query_jobs[file_tag] = query_job

    # Pad file tags so the per-table log lines align, capped at 30 chars.
    table_column_width = min(
        max(len(tag) for tag in raw_file_config.raw_file_configs), 30
    )

    failed_tables: List[str] = []
    for file_tag in sorted(raw_file_config.raw_file_tags):
        justified_name = file_tag.ljust(table_column_width)

        if file_tag not in query_jobs:
            # This file did not exist in the project that is the source of truth.
            continue

        query_job = query_jobs[file_tag]
        try:
            rows = query_job.result()
        except exceptions.NotFound:
            logging.warning(
                "%s | Missing table %s.%s.%s",
                justified_name,
                comparison_project_id,
                dataset_id,
                file_tag,
            )
            failed_tables.append(file_tag)
            continue

        # Each result row is (update_datetime, num_missing). Wrap in tuple()
        # so the elements actually match the declared annotation instead of
        # being BigQuery Row value sequences.
        counts: List[Tuple[datetime.datetime, int]] = [
            tuple(row.values()) for row in rows
        ]

        if counts:
            logging.warning(
                "%s | Missing data in the %s table",
                justified_name,
                comparison_project_id,
            )
            for update_datetime, num_missing in counts:
                # Fixed stray literal 's' in the old "\t%ss: %d" placeholder.
                logging.warning(
                    "\t%s: %d", update_datetime.isoformat(), num_missing
                )
            failed_tables.append(file_tag)
        else:
            logging.info(
                "%s | %s contains all of the data from %s",
                justified_name,
                comparison_project_id,
                source_project_id,
            )

    return failed_tables