def setUp(self) -> None:
    """Collects the sci_incarceration_period ingest view builder and records the
    columns its query result is expected to contain."""
    super().setUp()
    collected_builders = DirectIngestPreProcessedIngestViewCollector(
        get_region(STATE_CODE, is_direct_ingest=True), []
    ).collect_view_builders()
    # `one` raises unless exactly one builder matches the tag.
    self.view_builder = one(
        builder
        for builder in collected_builders
        if builder.file_tag == "sci_incarceration_period"
    )
    self.expected_result_columns = [
        "control_number",
        "inmate_number",
        "sequence_number",
        "start_movement_date",
        "end_movement_date",
        "location",
        "start_sentence_status_code",
        "end_sentence_status_code",
        "start_parole_status_code",
        "end_parole_status_code",
        "start_movement_code",
        "end_movement_code",
        "start_is_new_revocation",
        "start_is_admin_edge",
        "end_is_admin_edge",
        "sentence_type",
    ]
def get_ingest_view_configs(
    region_code: str,
) -> List[DataDiscoveryStandardizedFileConfig]:
    """Collect ingest views for region; reads columns from their corresponding fixture csv.

    Args:
        region_code: state code identifying the region; case-insensitive.

    Returns:
        One DataDiscoveryStandardizedFileConfig per ingest view that has a fixture
        csv on disk; views without a fixture are silently skipped.

    Raises:
        ValueError: if `region_code` is not a valid state code.
    """
    if not StateCode.is_state_code(region_code):
        raise ValueError(
            f"Unknown region_code [{region_code}] received, must be a valid state code."
        )

    region_code = region_code.lower()
    views = DirectIngestPreProcessedIngestViewCollector(
        get_region(region_code, True), []
    ).collect_view_builders()

    configs = []
    for view in views:
        try:
            # TODO(#6925) Infer columns from the mapping file rather than the fixture csv
            fixture_path = os.path.join(
                os.path.dirname(recidiviz.__file__),
                f"tests/ingest/direct/direct_ingest_fixtures/{region_code}/{view.ingest_view_name}.csv",
            )
            with open(fixture_path, "r") as f:
                # Bug fix: strip the trailing newline from the header row so the
                # last column name is not read as e.g. "sentence_type\n".
                columns = f.readline().strip().split(",")
        except FileNotFoundError:
            # Best-effort: views with no fixture csv simply get no config.
            continue

        standardized_config = DataDiscoveryStandardizedFileConfig(
            file_tag=view.ingest_view_name,
            columns=columns,
        )
        configs.append(standardized_config)

    return configs
def test_collect_and_build_ingest_view_builders(
    self, _name: str, project_id: str, environment: GCPEnvironment
) -> None:
    """Every region's collected ingest view builders should build without raising,
    for each (project, environment) parameterization."""
    with patch(
        "recidiviz.utils.environment.get_gcp_environment",
        return_value=environment,
    ), patch("recidiviz.utils.metadata.project_id", return_value=project_id):
        for code in self.region_dir_names:
            region = get_region(
                code,
                is_direct_ingest=True,
                region_module_override=self.region_module_override,
            )
            # The controller factory needs a concrete project id to resolve
            # bucket paths, so pin one for the build.
            with patch(
                "recidiviz.utils.metadata.project_id",
                return_value="recidiviz-456",
            ):
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=self.primary_ingest_bucket_for_region(
                        region
                    ),
                    allow_unlaunched=True,
                )
                collected = DirectIngestPreProcessedIngestViewCollector(
                    region, controller.get_file_tag_rank_list()
                ).collect_view_builders()
                for collected_builder in collected:
                    collected_builder.build()
def test_raw_files_yaml_parses_all_regions(self) -> None:
    """Every GCS-ingest region's raw file configs should parse, and no two raw
    file configs may share a file_tag."""
    for region_code in self.region_dir_names:
        region = get_region(
            region_code,
            is_direct_ingest=True,
            region_module_override=self.region_module_override,
        )
        controller_class = region.get_ingestor_class()
        # Only GCS-filesystem direct-ingest regions have raw file configs to check.
        if not issubclass(controller_class, GcsfsDirectIngestController):
            continue
        builders = DirectIngestPreProcessedIngestViewCollector(
            region, controller_class.get_file_tag_rank_list()
        ).collect_view_builders()
        raw_file_manager = DirectIngestRegionRawFileConfig(
            region_code=region.region_code,
            region_module=self.region_module_override,
        )
        if builders or raw_file_manager.raw_file_configs:
            if region.raw_data_bq_imports_enabled_env is not None:
                self.test.assertTrue(raw_file_manager.raw_file_configs)
            config_file_tags = set()
            for config in raw_file_manager.raw_file_configs.values():
                # assertNotIn gives a clearer failure message than
                # assertTrue(x not in s) with identical pass/fail behavior.
                self.test.assertNotIn(
                    config.file_tag,
                    config_file_tags,
                    f"Multiple raw file configs defined with the same "
                    f"file_tag [{config.file_tag}]",
                )
                config_file_tags.add(config.file_tag)
def generate_raw_file_docs_for_region(self, region_code: str) -> Dict[str, str]:
    """Generates documentation for all raw file configs for the given region and returns all
    of it as a combined string.

    Returns one Markdown-formatted string per raw file, mapped to its filename, as well
    as a header file with a table of contents.
    """
    region_config = DirectIngestRegionRawFileConfig(region_code=region_code)

    # State regions get a templated header; anything else gets an empty one.
    if StateCode.is_state_code(region_code):
        state_code = StateCode(region_code.upper())
        file_header = STATE_RAW_DATA_FILE_HEADER_TEMPLATE.format(
            state_name=state_code.get_state().name,
            state_code_lower=state_code.value.lower(),
        )
    else:
        file_header = ""

    sorted_file_tags = sorted(region_config.raw_file_tags)
    raw_file_configs = [
        region_config.raw_file_configs[tag] for tag in sorted_file_tags
    ]
    config_paths_by_file_tag = {
        tag: config.file_path
        for tag, config in region_config.raw_file_configs.items()
    }
    file_tags_with_raw_file_configs = [
        config.file_tag for config in raw_file_configs
    ]

    region = regions.get_region(region_code=region_code, is_direct_ingest=True)
    views_by_raw_file = self.get_referencing_views(
        DirectIngestPreProcessedIngestViewCollector(region, [])
    )
    touched_configs = self._get_touched_raw_data_configs(
        region_config.yaml_config_file_dir
    )

    raw_file_table = self._generate_raw_file_table(
        config_paths_by_file_tag,
        file_tags_with_raw_file_configs,
        views_by_raw_file,
        touched_configs,
    )

    docs_per_file: Dict[str, str] = {
        f"{config.file_tag}.md": self._generate_docs_for_raw_config(config)
        for config in raw_file_configs
    }
    docs_per_file[STATE_RAW_DATA_FILE_HEADER_PATH] = (
        file_header + "\n" + raw_file_table
    )
    return docs_per_file
def __init__(self,
             region_name: str,
             system_level: SystemLevel,
             ingest_directory_path: Optional[str] = None,
             storage_directory_path: Optional[str] = None,
             max_delay_sec_between_files: Optional[int] = None):
    """Wires up GCS paths, the job prioritizer, and the raw-file import and
    ingest-view export managers for this region's direct ingest controller.

    Args:
        region_name: the region this controller ingests for.
        system_level: system level used to derive default bucket paths.
        ingest_directory_path: bucket path to read ingest files from; derived
            from region/system level when not provided.
        storage_directory_path: bucket path processed files are stored in;
            derived from region/system level when not provided.
        max_delay_sec_between_files: stored on self for callers — semantics not
            visible in this block.
    """
    super().__init__(region_name, system_level)
    self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
    self.max_delay_sec_between_files = max_delay_sec_between_files

    # Fall back to the conventional per-region ingest bucket when no explicit
    # path was provided.
    if not ingest_directory_path:
        ingest_directory_path = \
            gcsfs_direct_ingest_directory_path_for_region(region_name,
                                                          system_level)
    self.ingest_directory_path = \
        GcsfsDirectoryPath.from_absolute_path(ingest_directory_path)

    # Same fallback for the storage bucket.
    if not storage_directory_path:
        storage_directory_path = \
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_name, system_level)
    self.storage_directory_path = \
        GcsfsDirectoryPath.from_absolute_path(storage_directory_path)

    self.temp_output_directory_path = \
        GcsfsDirectoryPath.from_absolute_path(gcsfs_direct_ingest_temporary_output_directory_path())

    # Only schedule jobs for INGEST_VIEW files when the region distinguishes
    # raw vs ingest file names; otherwise no file-type filter is applied.
    ingest_job_file_type_filter = \
        GcsfsDirectIngestFileType.INGEST_VIEW \
        if self.region.is_raw_vs_ingest_file_name_detection_enabled() else None
    self.file_prioritizer = \
        GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_directory_path,
            self.get_file_tag_rank_list(),
            ingest_job_file_type_filter)

    self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

    self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
        region_code=self.region.region_code)

    self.raw_file_import_manager = DirectIngestRawFileImportManager(
        region=self.region,
        fs=self.fs,
        ingest_directory_path=self.ingest_directory_path,
        temp_output_directory_path=self.temp_output_directory_path,
        big_query_client=BigQueryClientImpl())

    self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
        region=self.region,
        fs=self.fs,
        ingest_directory_path=self.ingest_directory_path,
        file_metadata_manager=self.file_metadata_manager,
        big_query_client=BigQueryClientImpl(),
        view_collector=DirectIngestPreProcessedIngestViewCollector(
            self.region,
            self.get_file_tag_rank_list()))
def __init__(self, ingest_bucket_path: GcsfsBucketPath) -> None:
    """Initialize the controller.

    Wires up cloud-task, lock, filesystem, metadata, raw-file import, and
    ingest-view export collaborators for the ingest instance implied by
    `ingest_bucket_path`.
    """
    self.cloud_task_manager = DirectIngestCloudTaskManagerImpl()
    # The ingest instance (e.g. primary vs secondary) is derived from the
    # bucket the controller is pointed at.
    self.ingest_instance = DirectIngestInstance.for_ingest_bucket(
        ingest_bucket_path)
    self.region_lock_manager = DirectIngestRegionLockManager.for_direct_ingest(
        region_code=self.region.region_code,
        schema_type=self.system_level.schema_type(),
        ingest_instance=self.ingest_instance,
    )
    self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
    self.ingest_bucket_path = ingest_bucket_path
    self.storage_directory_path = (
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code=self.region_code(),
            system_level=self.system_level,
            ingest_instance=self.ingest_instance,
        ))
    self.temp_output_directory_path = (
        gcsfs_direct_ingest_temporary_output_directory_path())
    self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
        self.fs,
        self.ingest_bucket_path,
        self.get_file_tag_rank_list(),
    )
    self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT
    self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
        region_code=self.region.region_code,
        ingest_database_name=self.ingest_database_key.db_name,
    )
    self.raw_file_import_manager = DirectIngestRawFileImportManager(
        region=self.region,
        fs=self.fs,
        ingest_bucket_path=self.ingest_bucket_path,
        temp_output_directory_path=self.temp_output_directory_path,
        big_query_client=BigQueryClientImpl(),
    )
    self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
        region=self.region,
        fs=self.fs,
        output_bucket_name=self.ingest_bucket_path.bucket_name,
        file_metadata_manager=self.file_metadata_manager,
        big_query_client=BigQueryClientImpl(),
        view_collector=DirectIngestPreProcessedIngestViewCollector(
            self.region, self.get_file_tag_rank_list()),
        # NOTE(review): launched tags mirror the full rank list here — confirm
        # that unlaunched tags are intended to be included.
        launched_file_tags=self.get_file_tag_rank_list(),
    )
    self.ingest_instance_status_manager = DirectIngestInstanceStatusManager(
        self.region_code(), self.ingest_instance)
def test_collect_ingest_views(self):
    """Collecting ingest views for every existing region should not raise."""
    with local_project_id_override('project'):
        for code in self._get_existing_region_dir_names():
            region = get_region(code, is_direct_ingest=True)
            ingestor_cls = region.get_ingestor_class()
            # Only GCS-filesystem controllers expose a file-tag rank list.
            if issubclass(ingestor_cls, GcsfsDirectIngestController):
                _ = DirectIngestPreProcessedIngestViewCollector(
                    region, ingestor_cls.get_file_tag_rank_list()
                ).collect_views()
def get_referencing_views(
    view_collector: DirectIngestPreProcessedIngestViewCollector,
) -> Dict[str, List[str]]:
    """Generates a dictionary mapping raw files to ingest views that reference them"""
    referencing_views: Dict[str, List[str]] = defaultdict(list)
    for view_builder in view_collector.collect_view_builders():
        view = view_builder.build()
        # Each raw-table dependency of the built view records this view's tag.
        for raw_config in view.raw_table_dependency_configs:
            referencing_views[raw_config.file_tag].append(view.file_tag)
    return referencing_views
def generate_raw_file_docs_for_region(self, region_code: str) -> str:
    """Generates documentation for all raw file configs for the given region and
    returns all of it as a combined string."""
    region_config = DirectIngestRegionRawFileConfig(
        region_code=region_code)

    sorted_file_tags = sorted(region_config.raw_file_tags)

    if StateCode.is_state_code(region_code):
        state_code = StateCode(region_code.upper())
        # Bug fix: format the state's *name* into the header template, not the
        # state object itself (which would render via its str/repr). Matches
        # the sibling implementation that uses get_state().name.
        state_name = state_code.get_state().name

        file_header = STATE_RAW_DATA_FILE_HEADER_TEMPLATE.format(
            state_name=state_name,
            state_code_lower=state_code.value.lower())
    else:
        file_header = ""

    raw_file_configs = [
        region_config.raw_file_configs[file_tag]
        for file_tag in sorted_file_tags
    ]

    config_paths_by_file_tag = {
        file_tag: file_config.file_path
        for file_tag, file_config in region_config.raw_file_configs.items()
    }

    file_tags_with_raw_file_configs = [
        raw_file_config.file_tag for raw_file_config in raw_file_configs
    ]

    region = regions.get_region(region_code=region_code,
                                is_direct_ingest=True)
    view_collector = DirectIngestPreProcessedIngestViewCollector(
        region, [])
    views_by_raw_file = self.get_referencing_views(view_collector)

    raw_file_table = self._generate_raw_file_table(
        config_paths_by_file_tag,
        file_tags_with_raw_file_configs,
        views_by_raw_file)

    docs_per_file = [
        self._generate_docs_for_raw_config(config)
        for config in raw_file_configs
    ]

    return file_header + "\n" + raw_file_table + "\n" + "\n\n".join(
        docs_per_file)
def setUp(self) -> None:
    """Collects the person_external_ids ingest view builder and records the
    columns its query result is expected to contain."""
    super().setUp()
    collected_builders = DirectIngestPreProcessedIngestViewCollector(
        get_region(STATE_CODE, is_direct_ingest=True), []
    ).collect_view_builders()
    # `one` raises unless exactly one builder matches the tag.
    self.view_builder = one(
        builder
        for builder in collected_builders
        if builder.file_tag == "person_external_ids"
    )
    self.expected_result_columns = [
        "recidiviz_master_person_id",
        "control_numbers",
        "inmate_numbers",
        "parole_numbers",
    ]
def test_collect_and_build_ingest_view_builders(
    self, _name: str, project_id: str, environment: GCPEnvironment
) -> None:
    """Every GCS-ingest region's collected view builders should build without
    raising, for each (project, environment) parameterization."""
    with patch(
        "recidiviz.utils.environment.get_gcp_environment",
        return_value=environment,
    ), patch("recidiviz.utils.metadata.project_id", return_value=project_id):
        for code in self.region_dir_names:
            region = get_region(
                code,
                is_direct_ingest=True,
                region_module_override=self.region_module_override,
            )
            ingestor_cls = region.get_ingestor_class()
            # Only GCS-filesystem controllers expose a file-tag rank list.
            if not issubclass(ingestor_cls, GcsfsDirectIngestController):
                continue
            collected = DirectIngestPreProcessedIngestViewCollector(
                region, ingestor_cls.get_file_tag_rank_list()
            ).collect_view_builders()
            for collected_builder in collected:
                collected_builder.build()
def test_raw_files_yaml_parses_all_regions(self) -> None:
    """Every region's raw file configs should parse, and no two raw file
    configs may share a file_tag."""
    for region_code in self.region_dir_names:
        region = get_region(
            region_code,
            is_direct_ingest=True,
            region_module_override=self.region_module_override,
        )
        # The controller factory needs a concrete project id to resolve
        # bucket paths, so pin one for the build.
        with patch(
            "recidiviz.utils.metadata.project_id", return_value="recidiviz-456"
        ):
            controller = DirectIngestControllerFactory.build(
                ingest_bucket_path=self.primary_ingest_bucket_for_region(
                    region),
                allow_unlaunched=True,
            )
            builders = DirectIngestPreProcessedIngestViewCollector(
                region, controller.get_file_tag_rank_list()
            ).collect_view_builders()
            raw_file_manager = DirectIngestRegionRawFileConfig(
                region_code=region.region_code,
                region_module=self.region_module_override,
            )
            if builders or raw_file_manager.raw_file_configs:
                # NOTE(review): is_ingest_launched_in_env() looks like it
                # returns a bool, so `is not None` would always be true —
                # confirm the intended condition before tightening it.
                if region.is_ingest_launched_in_env() is not None:
                    self.test.assertTrue(raw_file_manager.raw_file_configs)
                config_file_tags = set()
                for config in raw_file_manager.raw_file_configs.values():
                    # assertNotIn gives a clearer failure message than
                    # assertTrue(x not in s) with identical pass/fail behavior.
                    self.test.assertNotIn(
                        config.file_tag,
                        config_file_tags,
                        f"Multiple raw file configs defined with the same "
                        f"file_tag [{config.file_tag}]",
                    )
                    config_file_tags.add(config.file_tag)
        )
        for metadata in metadata_list
    ]


if __name__ == "__main__":
    # Update these variables and run to print an export query you can run in the BigQuery UI
    region_code_: str = "us_mo"
    ingest_view_name_: str = "tak001_offender_identification"
    upper_bound_datetime_prev_: datetime.datetime = datetime.datetime(2020, 10, 15)
    upper_bound_datetime_to_export_: datetime.datetime = datetime.datetime(2020, 12, 18)

    with local_project_id_override(GCP_PROJECT_STAGING):
        region_ = regions.get_region(region_code_, is_direct_ingest=True)
        view_collector_ = DirectIngestPreProcessedIngestViewCollector(region_, [])
        # Build every collected view so the debug query can reference any of them.
        views_by_tag_ = {
            builder.file_tag: builder.build()
            for builder in view_collector_.collect_view_builders()
        }
        debug_query = DirectIngestIngestViewExportManager.debug_query_for_args(
            views_by_tag_,
            GcsfsIngestViewExportArgs(
                ingest_view_name=ingest_view_name_,
                upper_bound_datetime_prev=upper_bound_datetime_prev_,
                upper_bound_datetime_to_export=upper_bound_datetime_to_export_,
            ),
        )
        print(debug_query)
        for metadata in metadata_list
    ]


if __name__ == '__main__':
    # Update these variables and run to print an export query you can run in the BigQuery UI
    region_code_: str = 'us_id'
    ingest_view_name_: str = 'movement_facility_location_offstat_supervision_periods'
    upper_bound_datetime_prev_: datetime.datetime = datetime.datetime(
        2020, 6, 29)
    upper_bound_datetime_to_export_: datetime.datetime = datetime.datetime(
        2020, 7, 29)

    with local_project_id_override(GCP_PROJECT_STAGING):
        region_ = regions.get_region(region_code_, is_direct_ingest=True)
        view_collector_ = DirectIngestPreProcessedIngestViewCollector(
            region_, [])
        # Collect every view so the debug query can reference any of them.
        views_by_tag_ = {
            view.file_tag: view
            for view in view_collector_.collect_views()
        }
        DirectIngestIngestViewExportManager.print_debug_query_for_args(
            views_by_tag_,
            GcsfsIngestViewExportArgs(
                ingest_view_name=ingest_view_name_,
                upper_bound_datetime_prev=upper_bound_datetime_prev_,
                upper_bound_datetime_to_export=upper_bound_datetime_to_export_)
        )