def generate_raw_file_docs_for_region(self, region_code: str) -> Dict[str, str]:
    """Generates documentation for all raw file configs for the given region.

    Returns one Markdown-formatted string per raw file, mapped to its filename,
    as well as a header file with a table of contents.
    """
    region_config = DirectIngestRegionRawFileConfig(region_code=region_code)

    sorted_file_tags = sorted(region_config.raw_file_tags)

    if StateCode.is_state_code(region_code):
        state_code = StateCode(region_code.upper())
        state_name = state_code.get_state().name

        file_header = STATE_RAW_DATA_FILE_HEADER_TEMPLATE.format(
            state_name=state_name, state_code_lower=state_code.value.lower()
        )
    else:
        file_header = ""

    raw_file_configs = [
        region_config.raw_file_configs[file_tag] for file_tag in sorted_file_tags
    ]

    config_paths_by_file_tag = {
        file_tag: file_config.file_path
        for file_tag, file_config in region_config.raw_file_configs.items()
    }

    file_tags_with_raw_file_configs = [
        raw_file_config.file_tag for raw_file_config in raw_file_configs
    ]

    region = regions.get_region(region_code=region_code, is_direct_ingest=True)

    view_collector = DirectIngestPreProcessedIngestViewCollector(region, [])
    views_by_raw_file = self.get_referencing_views(view_collector)
    touched_configs = self._get_touched_raw_data_configs(
        region_config.yaml_config_file_dir
    )

    raw_file_table = self._generate_raw_file_table(
        config_paths_by_file_tag,
        file_tags_with_raw_file_configs,
        views_by_raw_file,
        touched_configs,
    )

    docs_per_file: Dict[str, str] = {
        f"{config.file_tag}.md": self._generate_docs_for_raw_config(config)
        for config in raw_file_configs
    }

    docs_per_file[STATE_RAW_DATA_FILE_HEADER_PATH] = (
        file_header + "\n" + raw_file_table
    )

    return docs_per_file


def generate_raw_file_docs_for_region(self, region_code: str) -> str:
    """Generates documentation for all raw file configs for the given region and
    returns all of it as a combined string."""
    region_config = DirectIngestRegionRawFileConfig(region_code=region_code)

    sorted_file_tags = sorted(region_config.raw_file_tags)

    if StateCode.is_state_code(region_code):
        state_code = StateCode(region_code.upper())
        state_name = state_code.get_state()

        file_header = STATE_RAW_DATA_FILE_HEADER_TEMPLATE.format(
            state_name=state_name, state_code_lower=state_code.value.lower()
        )
    else:
        file_header = ""

    raw_file_configs = [
        region_config.raw_file_configs[file_tag] for file_tag in sorted_file_tags
    ]

    config_paths_by_file_tag = {
        file_tag: file_config.file_path
        for file_tag, file_config in region_config.raw_file_configs.items()
    }

    file_tags_with_raw_file_configs = [
        raw_file_config.file_tag for raw_file_config in raw_file_configs
    ]

    region = regions.get_region(region_code=region_code, is_direct_ingest=True)

    view_collector = DirectIngestPreProcessedIngestViewCollector(region, [])
    views_by_raw_file = self.get_referencing_views(view_collector)

    raw_file_table = self._generate_raw_file_table(
        config_paths_by_file_tag, file_tags_with_raw_file_configs, views_by_raw_file
    )

    docs_per_file = [
        self._generate_docs_for_raw_config(config) for config in raw_file_configs
    ]

    return file_header + "\n" + raw_file_table + "\n" + "\n\n".join(docs_per_file)


def _get_product_enabled_states(self) -> Set[StateCode]:
    states: Set[str] = set()
    for product in self.products:
        if product.states is not None:
            states = states.union({state.state_code for state in product.states})

    for state_code in states:
        if not StateCode.is_state_code(state_code):
            raise ValueError(
                f"Found invalid state code value [{state_code}]"
                f" in product config."
            )
    return {StateCode(state_code) for state_code in states}


def get_ingest_view_configs(
    region_code: str,
) -> List[DataDiscoveryStandardizedFileConfig]:
    """Collects ingest views for the region; reads columns from their corresponding fixture CSVs."""
    if not StateCode.is_state_code(region_code):
        raise ValueError(
            f"Unknown region_code [{region_code}] received, must be a valid state code."
        )

    region_code = region_code.lower()

    views = DirectIngestPreProcessedIngestViewCollector(
        get_region(region_code, True), []
    ).collect_view_builders()

    configs = []
    for view in views:
        try:
            # TODO(#6925) Infer columns from the mapping file rather than the fixture csv
            fixture_path = os.path.join(
                os.path.dirname(recidiviz.__file__),
                f"tests/ingest/direct/direct_ingest_fixtures/{region_code}/{view.ingest_view_name}.csv",
            )

            with open(fixture_path, "r") as f:
                columns = f.readline().split(",")
        except FileNotFoundError:
            continue

        standardized_config = DataDiscoveryStandardizedFileConfig(
            file_tag=view.ingest_view_name,
            columns=columns,
        )

        configs.append(standardized_config)

    return configs


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Generates direct ingest region documentation."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "filenames",
        nargs="*",
        help="Modified files to indicate which regions need their docs to be regenerated. "
        "Paths must be relative to the root of the repository. "
        "If none are provided, will use `git diff` to determine modified files.",
    )
    args = parser.parse_args(argv)

    # Arbitrary project ID - we just need to build views in order to obtain raw table dependencies
    with local_project_id_override(GCP_PROJECT_STAGING):
        modified = False
        touched_raw_data_regions = get_touched_raw_data_regions(args.filenames)
        for region_code in touched_raw_data_regions:
            if not StateCode.is_state_code(region_code):
                logging.info(
                    "Skipping raw data documentation for non-state region [%s]",
                    region_code,
                )
                continue
            logging.info(
                "Generating raw data documentation for region [%s]", region_code
            )
            modified |= generate_raw_data_documentation_for_region(region_code)

        if modified:
            update_summary_file(
                _create_ingest_catalog_summary(), "## State Ingest Catalog"
            )

    return 1 if modified else 0


def _acquire_ingest_lock() -> Tuple[str, HTTPStatus]:
    try:
        state_code = StateCode(request.json["stateCode"])
        ingest_instance = DirectIngestInstance(request.json["ingestInstance"])
    except ValueError:
        return "invalid parameters provided", HTTPStatus.BAD_REQUEST

    lock_manager = DirectIngestRegionLockManager.for_state_ingest(
        state_code, ingest_instance=ingest_instance
    )
    try:
        lock_manager.acquire_lock()
    except GCSPseudoLockAlreadyExists:
        return "lock already exists", HTTPStatus.CONFLICT

    if not lock_manager.can_proceed():
        try:
            lock_manager.release_lock()
        except Exception as e:
            logging.exception(e)
        return (
            "other locks blocking ingest have been acquired; releasing lock",
            HTTPStatus.CONFLICT,
        )

    return "", HTTPStatus.OK


def _get_state_code_from_str(state_code_str: str) -> StateCode:
    if not StateCode.is_state_code(state_code_str):
        raise ValueError(
            f"Unknown region_code [{state_code_str}] received, must be a valid state code."
        )
    return StateCode[state_code_str.upper()]


def read_db_entity_trees_of_cls_to_merge(
    session: Session, state_code: str, schema_cls: Type[StateBase]
) -> List[List[EntityTree]]:
    """
    Returns a list of lists of EntityTree where each inner list is a group
    of EntityTrees with entities of class |schema_cls| that need to be merged
    because their entities have the same external_id.

    Will assert if schema_cls does not have a person_id or external_id field.
    """
    if not StateCode.is_valid(state_code):
        raise ValueError(f"Invalid state code: [{state_code}]")

    external_ids = dao.read_external_ids_of_cls_with_external_id_match(
        session, state_code, schema_cls
    )
    people = dao.read_people_by_cls_external_ids(
        session, state_code, schema_cls, external_ids
    )
    all_cls_trees = get_all_entity_trees_of_cls(people, schema_cls)

    external_ids_map: Dict[str, List[EntityTree]] = defaultdict(list)
    for tree in all_cls_trees:
        if not isinstance(tree.entity, schema_cls):
            raise ValueError(f"Unexpected entity type [{type(tree.entity)}]")
        if tree.entity.external_id in external_ids:
            external_ids_map[tree.entity.external_id].append(tree)

    return [tree_list for _, tree_list in external_ids_map.items()]


def __init__(
    self,
    project_id: str,
    region: str,
    lower_bound_update_datetime: Optional[datetime.datetime],
    gcs_destination_path: Optional[GcsfsDirectoryPath] = None,
):
    self.project_id = project_id
    self.region = region.lower()
    self.auth = SftpAuth.for_region(region)
    self.delegate = SftpDownloadDelegateFactory.build(region_code=region)
    self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())
    self.unable_to_download_items: List[str] = []
    self.downloaded_items: List[Tuple[str, datetime.datetime]] = []
    self.skipped_files: List[str] = []
    self.lower_bound_update_datetime = lower_bound_update_datetime
    self.bucket = (
        gcsfs_sftp_download_bucket_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id
        )
        if gcs_destination_path is None
        else gcs_destination_path
    )
    self.download_dir = GcsfsDirectoryPath.from_dir_and_subdir(
        dir_path=self.bucket, subdir=RAW_INGEST_DIRECTORY
    )
    self.postgres_direct_ingest_file_metadata_manager = (
        PostgresDirectIngestRawFileMetadataManager(
            region,
            DirectIngestInstance.PRIMARY.database_version(
                SystemLevel.STATE, state_code=StateCode(self.region.upper())
            ).name,
        )
    )


def _get_dataflow_pipeline_enabled_states(self) -> Set[StateCode]:
    """Returns the set of StateCodes for all states present in our production calc
    pipeline template."""
    states = {
        pipeline.peek("state_code", str).upper()
        for pipeline in self.daily_pipelines
    }.union(
        {
            pipeline.peek("state_code", str).upper()
            for pipeline in self.historical_pipelines
        }
    )

    for state_code in states:
        if not StateCode.is_state_code(state_code):
            raise ValueError(
                f"Found invalid state code value [{state_code}]"
                f" in pipeline template config."
            )

    return {StateCode(state_code) for state_code in states}


def get_data_folder(
    drive: Drive,
    state_code: states.StateCode,
    system: schema.System,
    base_drive_folder_id: str,
) -> DriveItem:
    state_folder = drive.get_folder(state_code.get_state().name, base_drive_folder_id)
    corrections_folder = drive.get_folder(system.value.title(), state_folder.id)
    return drive.get_folder("Data", corrections_folder.id)


def _main_database_key(cls) -> "SQLAlchemyDatabaseKey":
    if cls.schema_type() == SchemaType.STATE:
        state_code = StateCode(cls.region_code().upper())
        return SQLAlchemyDatabaseKey.for_state_code(
            state_code,
            cls._main_ingest_instance().database_version(
                SystemLevel.STATE, state_code=state_code
            ),
        )
    return SQLAlchemyDatabaseKey.for_schema(cls.schema_type())


def _ingest_lock_name_for_instance(self) -> str:
    if StateCode.is_state_code(self.region_code):
        return (
            STATE_GCS_TO_POSTGRES_INGEST_RUNNING_LOCK_PREFIX
            + self.region_code.upper()
            + f"_{self.ingest_instance.name}"
        )
    return (
        JAILS_GCS_TO_POSTGRES_INGEST_RUNNING_LOCK_PREFIX + self.region_code.upper()
    )


def test_get_batch_ids_valid_arguments(self) -> None:
    """Given all valid arguments, should return a list of batch ids ordered in
    descending order, since we want the most recent batch to be at the top of the
    list."""
    self._upload_fake_email_buckets()
    batch_list = self.admin_stores.get_batch_ids(
        state_code=StateCode(self.STATE_CODE_STR), override_fs=self.fs
    )
    self.assertEqual(
        ["20210701202022", "20210701202021", "20210701202020"], batch_list
    )


def _get_translated_key_column_mask(self) -> int:
    """Returns an integer mask to add to every primary/foreign key column in this
    query. The mask is stable across all tables and derived from the region code.

    Example: 46000000000000

    For the above mask, if a primary key is 123456 in Postgres, then the translated
    primary key would be 46000000123456.
    """
    if not self.region_code:
        raise ValueError(
            "Must have set region code to do primary/foreign key translation."
        )
    if not StateCode.is_state_code(self.region_code):
        raise ValueError(
            "No support yet for doing primary/foreign key translation on non-state "
            "regions."
        )
    # The FIPS code is always a two-digit code for states
    fips = int(StateCode(self.region_code).get_state().fips)
    return fips * pow(10, 12)


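# Illustrative sketch, not part of the codebase: the mask arithmetic from the
# docstring above, worked through for a hypothetical state whose FIPS code is 46.
# All names and values here are for demonstration only.
fips = 46
mask = fips * pow(10, 12)            # 46_000_000_000_000
postgres_primary_key = 123456
translated_key = mask + postgres_primary_key
assert translated_key == 46_000_000_123_456

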
def from_report_json(report_json: dict) -> "Recipient": saved_report_json = copy.deepcopy(report_json) saved_report_json[utils.KEY_STATE_CODE] = StateCode( report_json[utils.KEY_STATE_CODE] ) return Recipient( email_address=saved_report_json[utils.KEY_EMAIL_ADDRESS], state_code=saved_report_json[utils.KEY_STATE_CODE], district=saved_report_json[utils.KEY_DISTRICT], data=saved_report_json, )
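# Hypothetical usage sketch (assumes from_report_json is exposed as a static
# constructor on Recipient and that the utils.KEY_* constants name the fields of a
# report payload; the literal values below are invented for illustration):
report_json = {
    utils.KEY_EMAIL_ADDRESS: "po@example.com",
    utils.KEY_STATE_CODE: "US_ID",
    utils.KEY_DISTRICT: "DISTRICT 3",
}
recipient = Recipient.from_report_json(report_json)
# recipient.state_code is now a StateCode enum member rather than the raw string,
# and the caller's report_json dict is left unmodified thanks to the deepcopy.

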
def ingest_database_key(self) -> SQLAlchemyDatabaseKey:
    schema_type = self.system_level.schema_type()
    if schema_type == SchemaType.STATE:
        state_code = StateCode(self.region_code().upper())
        return SQLAlchemyDatabaseKey.for_state_code(
            state_code,
            self.ingest_instance.database_version(
                self.system_level, state_code=state_code
            ),
        )
    return SQLAlchemyDatabaseKey.for_schema(schema_type)


def _create_ingest_catalog_summary() -> List[str]:
    """Creates the State Ingest Catalog portion of SUMMARY.md, as a list of lines."""
    ingest_catalog_states = sorted(
        [
            f.lower()
            for f in listdir(_INGEST_CATALOG_ROOT)
            if isdir(join(_INGEST_CATALOG_ROOT, f))
        ]
    )

    ingest_catalog_summary = ["## State Ingest Catalog\n\n"]

    for state in ingest_catalog_states:
        if StateCode.is_state_code(state):
            state_code = StateCode(state.upper())
            state_name = state_code.get_state()
        else:
            raise ValueError(
                f"Folder under {_INGEST_CATALOG_ROOT} named {state} is not a valid state code"
            )
        ingest_catalog_summary.extend(
            [
                f"- [{state_name}](ingest/{state}/{state}.md)\n",
                f"  - [Schema Mappings](ingest/{state}/schema_mappings.md)\n",
                f"  - [Raw Data Description](ingest/{state}/raw_data.md)\n",
            ]
        )

        raw_data_dir = join(_INGEST_CATALOG_ROOT, state, "raw_data")
        if not isdir(raw_data_dir):
            continue
        raw_data_files = sorted(
            [f for f in listdir(raw_data_dir) if isfile(join(raw_data_dir, f))]
        )

        for file_name in raw_data_files:
            ingest_catalog_summary.append(
                f"  - [{file_name[:-3]}](ingest/{state}/raw_data/{file_name})\n"
            )

    return ingest_catalog_summary


def _batch_ids() -> Tuple[str, HTTPStatus]:
    try:
        data = request.json
        state_code = StateCode(data.get("stateCode"))
        if state_code not in EMAIL_STATE_CODES:
            raise ValueError("State code is invalid for retrieving batch ids")
    except ValueError as error:
        logging.error(error)
        return str(error), HTTPStatus.BAD_REQUEST

    gcsfs_batch_ids = admin_stores.get_batch_ids(state_code)

    return (jsonify({"batchIds": gcsfs_batch_ids}), HTTPStatus.OK)


def for_region_code(cls, region_code: str, is_direct_ingest: bool) -> "SystemLevel":
    if is_direct_ingest is None:
        raise ValueError(
            "Region flag is_direct_ingest is None, expected boolean value."
        )
    if not is_direct_ingest:
        # There are some scrapers that scrape state jails websites (e.g.
        # recidiviz/ingest/scrape/regions/us_pa/us_pa_scraper.py) which we always
        # write to the Vera county jails database.
        return SystemLevel.COUNTY

    if StateCode.is_state_code(region_code.upper()):
        return SystemLevel.STATE
    return SystemLevel.COUNTY


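# Illustrative sketch, assuming for_region_code is a classmethod on the SystemLevel
# enum (as the cls parameter suggests). The region codes below are examples only:
# direct-ingest state regions resolve to STATE, everything else to COUNTY.
assert SystemLevel.for_region_code("us_pa", is_direct_ingest=True) == SystemLevel.STATE
assert SystemLevel.for_region_code("us_pa", is_direct_ingest=False) == SystemLevel.COUNTY
assert (
    SystemLevel.for_region_code("us_tx_brazos", is_direct_ingest=True)
    == SystemLevel.COUNTY
)

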
def get_states_by_product(
    self,
) -> Dict[ProductName, Dict[GCPEnvironment, List[StateCode]]]:
    """Returns the dict of products to states and environments."""
    states_by_product: Dict[
        ProductName, Dict[GCPEnvironment, List[StateCode]]
    ] = defaultdict(lambda: defaultdict(list))
    for product in self.products:
        if product.states is not None:
            for state in product.states:
                environment = GCPEnvironment(state.environment)
                state_code = StateCode(state.state_code)
                states_by_product[product.name][environment].append(state_code)

    return states_by_product


def _move_events_onto_supervision_periods_for_person(
    matched_persons: List[schema.StatePerson],
    event_cls: Type[DatabaseEntity],
    event_field_name: str,
    state_code: str,
) -> None:
    """For each person in |matched_persons|, moves all events of type |event_cls|
    onto the |event_field_name| field of a matching supervision period, based on
    date. If there is no matching supervision period, ensures that the events hang
    off of a placeholder chain.
    """
    if not StateCode.is_valid(state_code):
        raise ValueError(f"Invalid state code: [{state_code}]")

    for person in matched_persons:
        unmatched_events = _move_events_onto_supervision_periods(
            person, event_cls, event_field_name
        )
        if not unmatched_events:
            continue

        # We may hit this case if an entity that has already been committed to the DB
        # has a date updated in a later run such that the dates of the existing
        # supervision periods no longer line up with one of the existing events. In
        # this case, we want to store the event on a placeholder chain starting at
        # sentence_group. We do this to show that the supervision violation isn't
        # associated with anything other than the person.
        placeholder_sg = get_or_create_placeholder_child(
            person,
            "sentence_groups",
            schema.StateSentenceGroup,
            state_code=state_code,
            status=StateSentenceStatus.PRESENT_WITHOUT_INFO.value,
        )
        placeholder_s = get_or_create_placeholder_child(
            placeholder_sg,
            "supervision_sentences",
            schema.StateSupervisionSentence,
            person=person,
            state_code=state_code,
            status=StateSentenceStatus.PRESENT_WITHOUT_INFO.value,
        )
        placeholder_sp = get_or_create_placeholder_child(
            placeholder_s,
            "supervision_periods",
            schema.StateSupervisionPeriod,
            person=person,
            state_code=state_code,
            status=StateSupervisionPeriodStatus.PRESENT_WITHOUT_INFO.value,
        )
        placeholder_sp.set_field_from_list(event_field_name, unmatched_events)


def _release_ingest_lock() -> Tuple[str, HTTPStatus]:
    try:
        state_code = StateCode(request.json["stateCode"])
        ingest_instance = DirectIngestInstance(request.json["ingestInstance"])
    except ValueError:
        return "invalid parameters provided", HTTPStatus.BAD_REQUEST

    lock_manager = DirectIngestRegionLockManager.for_state_ingest(
        state_code, ingest_instance=ingest_instance
    )
    try:
        lock_manager.release_lock()
    except GCSPseudoLockDoesNotExist:
        return "lock does not exist", HTTPStatus.NOT_FOUND

    return "", HTTPStatus.OK


def test_state_codes_match_terraform_config(self) -> None:
    yaml_path = os.path.join(
        os.path.dirname(deploy.__file__),
        "terraform",
        "direct_ingest_state_codes.yaml",
    )
    with open(yaml_path, "r") as ymlfile:
        region_codes_list = yaml.full_load(ymlfile)

    for region in self.region_dir_names:
        if not StateCode.is_state_code(region):
            continue
        self.assertTrue(
            region.upper() in region_codes_list,
            f"State [{region}] must be listed in [{yaml_path}]",
        )


def _import_database_from_gcs() -> Tuple[str, HTTPStatus]:
    try:
        state_code = StateCode(request.json["stateCode"])
        db_version = SQLAlchemyStateDatabaseVersion(
            request.json["importToDatabaseVersion"].lower()
        )
        ingest_instance = DirectIngestInstance.for_state_database_version(
            database_version=db_version, state_code=state_code
        )
        exported_db_version = SQLAlchemyStateDatabaseVersion(
            request.json["exportedDatabaseVersion"].lower()
        )
    except ValueError:
        return "invalid parameters provided", HTTPStatus.BAD_REQUEST

    if db_version == SQLAlchemyStateDatabaseVersion.LEGACY:
        return "ingestInstance cannot be LEGACY", HTTPStatus.BAD_REQUEST

    lock_manager = DirectIngestRegionLockManager.for_state_ingest(
        state_code, ingest_instance=ingest_instance
    )
    if not lock_manager.can_proceed():
        return (
            "other locks blocking ingest have been acquired; aborting operation",
            HTTPStatus.CONFLICT,
        )

    db_key = SQLAlchemyDatabaseKey.for_state_code(state_code, db_version)
    cloud_sql_client = CloudSQLClientImpl(project_id=project_id)

    operation_id = cloud_sql_client.import_gcs_sql(
        db_key,
        GcsfsFilePath.from_absolute_path(
            f"{STATE_INGEST_EXPORT_URI}/{exported_db_version.value}/{state_code.value}"
        ),
    )
    if operation_id is None:
        return (
            "Cloud SQL import operation was not started successfully.",
            HTTPStatus.INTERNAL_SERVER_ERROR,
        )

    operation_succeeded = cloud_sql_client.wait_until_operation_completed(
        operation_id, seconds_to_wait=GCS_IMPORT_EXPORT_TIMEOUT_SEC
    )
    if not operation_succeeded:
        return (
            "Cloud SQL import did not complete within 60 seconds",
            HTTPStatus.INTERNAL_SERVER_ERROR,
        )

    return operation_id, HTTPStatus.OK


def get_export_configs_for_job_filter(
    self, export_job_filter: str
) -> List[ProductExportConfig]:
    """Returns the export configs for the given export_job_filter, which can be
    either a state_code or an export job name."""
    filter_uppercase = export_job_filter.upper()
    if StateCode.is_state_code(filter_uppercase):
        return [
            export
            for export in self.get_all_export_configs()
            if export["state_code"] == filter_uppercase
        ]
    return [
        export
        for export in self.get_all_export_configs()
        if export["export_job_name"] == filter_uppercase
    ]


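# Hypothetical usage sketch: `export_collection` stands in for whatever object
# exposes get_export_configs_for_job_filter, and "INGEST_METADATA" is an invented
# export job name. The filter is uppercased before comparison, so either a
# state code or a job name can be passed in any casing.
by_state = export_collection.get_export_configs_for_job_filter("us_pa")
# -> every ProductExportConfig whose "state_code" is "US_PA"
by_job = export_collection.get_export_configs_for_job_filter("ingest_metadata")
# -> every ProductExportConfig whose "export_job_name" is "INGEST_METADATA"

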
def _get_state_metric_calculations(
    pipelines: List[YAMLDict], frequency: str
) -> Dict[str, List[PipelineMetricInfo]]:
    """Returns a dict of state names to lists of info about their regularly
    calculated metrics."""
    state_metric_calculations = defaultdict(list)
    for pipeline in pipelines:
        state_metric_calculations[
            str(StateCode(pipeline.peek("state_code", str)).get_state())
        ].extend(
            [
                PipelineMetricInfo(
                    name=metric,
                    month_count=pipeline.peek_optional("calculation_month_count", int),
                    frequency=frequency,
                )
                for metric in pipeline.peek("metric_types", str).split()
            ],
        )
    return state_metric_calculations


def download_data(
    state_code: states.StateCode,
    system: schema.System,
    base_drive_folder_id: str,
    base_local_directory: str,
    credentials_directory: str,
) -> None:
    local_directory = os.path.join(
        base_local_directory, state_code.value, system.value
    )
    os.makedirs(local_directory, exist_ok=True)

    drive = Drive(credentials_directory)
    state_folder = drive.get_folder(state_code.get_state().name, base_drive_folder_id)
    corrections_folder = drive.get_folder(system.value.title(), state_folder.id)
    data_folder = drive.get_folder("Data", corrections_folder.id)

    drive.download_data(data_folder.id, local_directory=local_directory)


def _pause_direct_ingest_instance() -> Tuple[str, HTTPStatus]:
    try:
        state_code = StateCode(request.json["stateCode"])
        ingest_instance = DirectIngestInstance(request.json["ingestInstance"])
    except ValueError:
        return "invalid parameters provided", HTTPStatus.BAD_REQUEST

    ingest_status_manager = DirectIngestInstanceStatusManager(
        region_code=state_code.value, ingest_instance=ingest_instance
    )
    try:
        ingest_status_manager.pause_instance()
    except Exception:
        return (
            "something went wrong pausing the instance",
            HTTPStatus.INTERNAL_SERVER_ERROR,
        )

    return "", HTTPStatus.OK


def __init__(
    self,
    project_id: str,
    region: str,
    start_date_bound: Optional[str],
    end_date_bound: Optional[str],
    dry_run: bool,
    file_filter: Optional[str],
):
    self.project_id = project_id
    self.region = region
    self.state_code = StateCode(region.upper())
    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound
    self.dry_run = dry_run
    self.file_filter = file_filter

    self.storage_bucket = gcsfs_direct_ingest_storage_directory_path_for_region(
        region_code=region,
        system_level=SystemLevel.STATE,
        # Raw files are only ever stored in the PRIMARY storage bucket
        ingest_instance=DirectIngestInstance.PRIMARY,
        project_id=self.project_id,
    )
    self.ingest_bucket = gcsfs_direct_ingest_bucket_for_region(
        region_code=region,
        system_level=SystemLevel.STATE,
        # Raw files are only ever processed in the PRIMARY ingest bucket
        ingest_instance=DirectIngestInstance.PRIMARY,
        project_id=self.project_id,
    )

    self.mutex = threading.Lock()
    self.collect_progress: Optional[Bar] = None
    self.move_progress: Optional[Bar] = None
    self.moves_list: List[Tuple[str, str]] = []
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f"move_result_{region}_{self.project_id}_start_bound_{self.start_date_bound}_end_bound_"
        f"{self.end_date_bound}_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt",
    )