Example #1
    def generate_raw_file_docs_for_region(self, region_code: str) -> Dict[str, str]:
        """Generates documentation for all raw file configs for the given region and
        returns all of it as a combined string.

        Returns one Markdown-formatted string per raw file, mapped to its filename, as
        well as a header file with a table of contents.
        """
        region_config = DirectIngestRegionRawFileConfig(region_code=region_code)

        sorted_file_tags = sorted(region_config.raw_file_tags)

        if StateCode.is_state_code(region_code):
            state_code = StateCode(region_code.upper())
            state_name = state_code.get_state().name

            file_header = STATE_RAW_DATA_FILE_HEADER_TEMPLATE.format(
                state_name=state_name, state_code_lower=state_code.value.lower()
            )
        else:
            file_header = ""

        raw_file_configs = [
            region_config.raw_file_configs[file_tag] for file_tag in sorted_file_tags
        ]

        config_paths_by_file_tag = {
            file_tag: file_config.file_path
            for file_tag, file_config in region_config.raw_file_configs.items()
        }

        file_tags_with_raw_file_configs = [
            raw_file_config.file_tag for raw_file_config in raw_file_configs
        ]

        region = regions.get_region(region_code=region_code, is_direct_ingest=True)

        view_collector = DirectIngestPreProcessedIngestViewCollector(region, [])
        views_by_raw_file = self.get_referencing_views(view_collector)
        touched_configs = self._get_touched_raw_data_configs(
            region_config.yaml_config_file_dir
        )

        raw_file_table = self._generate_raw_file_table(
            config_paths_by_file_tag,
            file_tags_with_raw_file_configs,
            views_by_raw_file,
            touched_configs,
        )

        docs_per_file: Dict[str, str] = {
            f"{config.file_tag}.md": self._generate_docs_for_raw_config(config)
            for config in raw_file_configs
        }

        docs_per_file[STATE_RAW_DATA_FILE_HEADER_PATH] = (
            file_header + "\n" + raw_file_table
        )

        return docs_per_file
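
A minimal usage sketch for Example #1, assuming a hypothetical generator instance and output directory (the names below are illustrative, not the real caller):

    import os

    def write_region_docs(generator, region_code: str, docs_dir: str) -> None:
        # Keys are filenames ("<file_tag>.md" plus the header file); values are
        # the Markdown content for each file.
        docs_per_file = generator.generate_raw_file_docs_for_region(region_code)
        for filename, markdown in docs_per_file.items():
            with open(os.path.join(docs_dir, filename), "w") as f:
                f.write(markdown)
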
Example #2
    def generate_raw_file_docs_for_region(self, region_code: str) -> str:
        """Generates documentation for all raw file configs for the given region and returns all of it
        as a combined string."""
        region_config = DirectIngestRegionRawFileConfig(
            region_code=region_code)

        sorted_file_tags = sorted(region_config.raw_file_tags)

        if StateCode.is_state_code(region_code):
            state_code = StateCode(region_code.upper())
            state_name = state_code.get_state()

            file_header = STATE_RAW_DATA_FILE_HEADER_TEMPLATE.format(
                state_name=state_name,
                state_code_lower=state_code.value.lower())
        else:
            file_header = ""

        raw_file_configs = [
            region_config.raw_file_configs[file_tag]
            for file_tag in sorted_file_tags
        ]

        config_paths_by_file_tag = {
            file_tag: file_config.file_path
            for file_tag, file_config in
            region_config.raw_file_configs.items()
        }

        file_tags_with_raw_file_configs = [
            raw_file_config.file_tag for raw_file_config in raw_file_configs
        ]

        region = regions.get_region(region_code=region_code,
                                    is_direct_ingest=True)

        view_collector = DirectIngestPreProcessedIngestViewCollector(
            region, [])
        views_by_raw_file = self.get_referencing_views(view_collector)

        raw_file_table = self._generate_raw_file_table(
            config_paths_by_file_tag, file_tags_with_raw_file_configs,
            views_by_raw_file)

        docs_per_file = [
            self._generate_docs_for_raw_config(config)
            for config in raw_file_configs
        ]

        return file_header + "\n" + raw_file_table + "\n" + "\n\n".join(
            docs_per_file)
Example #3
    def _get_product_enabled_states(self) -> Set[StateCode]:
        states: Set[str] = set()
        for product in self.products:
            if product.states is not None:
                states = states.union(
                    {state.state_code
                     for state in product.states})

        for state_code in states:
            if not StateCode.is_state_code(state_code):
                raise ValueError(
                    f"Found invalid state code value [{state_code}]"
                    f" in product config.")
        return {StateCode(state_code) for state_code in states}
Example #4
def get_ingest_view_configs(
    region_code: str,
) -> List[DataDiscoveryStandardizedFileConfig]:
    """Collects ingest views for a region, reading columns from each view's corresponding fixture CSV."""
    if not StateCode.is_state_code(region_code):
        raise ValueError(
            f"Unknown region_code [{region_code}] received, must be a valid state code."
        )

    region_code = region_code.lower()

    views = DirectIngestPreProcessedIngestViewCollector(
        get_region(region_code, True), []).collect_view_builders()

    configs = []
    for view in views:
        try:
            # TODO(#6925) Infer columns from the mapping file rather than the fixture csv
            fixture_path = os.path.join(
                os.path.dirname(recidiviz.__file__),
                f"tests/ingest/direct/direct_ingest_fixtures/{region_code}/{view.ingest_view_name}.csv",
            )

            with open(fixture_path, "r") as f:
                # Strip the trailing newline so the last column name is clean.
                columns = f.readline().rstrip("\n").split(",")
        except FileNotFoundError:
            continue

        standardized_config = DataDiscoveryStandardizedFileConfig(
            file_tag=view.ingest_view_name,
            columns=columns,
        )

        configs.append(standardized_config)

    return configs
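
A hedged usage sketch for Example #4, assuming a valid state code such as "US_ND":

    configs = get_ingest_view_configs("US_ND")
    for config in configs:
        # Each config pairs an ingest view name with the column headers read
        # from the first line of its fixture CSV.
        print(config.file_tag, config.columns)
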
Example #5
def main(argv: Optional[Sequence[str]] = None) -> int:
    """Generates direct ingest region documentation."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "filenames",
        nargs="*",
        help="Modified files to indicate which regions need their docs to be regenerated. "
        "Paths must be relative to the root of the repository. "
        "If none are provided, will use `git diff` to determine modified files.",
    )
    args = parser.parse_args(argv)

    # Arbitrary project ID - we just need to build views in order to obtain raw table dependencies
    with local_project_id_override(GCP_PROJECT_STAGING):
        modified = False
        touched_raw_data_regions = get_touched_raw_data_regions(args.filenames)
        for region_code in touched_raw_data_regions:
            if not StateCode.is_state_code(region_code):
                logging.info(
                    "Skipping raw data documentation for non-state region [%s]",
                    region_code,
                )
                continue
            logging.info(
                "Generating raw data documentation for region [%s]", region_code
            )
            modified |= generate_raw_data_documentation_for_region(region_code)
        if modified:
            update_summary_file(
                _create_ingest_catalog_summary(), "## State Ingest Catalog"
            )
        return 1 if modified else 0
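
This main function follows the pre-commit hook convention: it returns 1 when any docs were regenerated so the caller knows files changed. A typical entry point, sketched under that assumption:

    if __name__ == "__main__":
        import logging
        import sys

        logging.basicConfig(level=logging.INFO)
        sys.exit(main())
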
Example #6
    def _acquire_ingest_lock() -> Tuple[str, HTTPStatus]:
        try:
            state_code = StateCode(request.json["stateCode"])
            ingest_instance = DirectIngestInstance(
                request.json["ingestInstance"])
        except ValueError:
            return "invalid parameters provided", HTTPStatus.BAD_REQUEST

        lock_manager = DirectIngestRegionLockManager.for_state_ingest(
            state_code, ingest_instance=ingest_instance)
        try:
            lock_manager.acquire_lock()
        except GCSPseudoLockAlreadyExists:
            return "lock already exists", HTTPStatus.CONFLICT

        if not lock_manager.can_proceed():
            try:
                lock_manager.release_lock()
            except Exception as e:
                logging.exception(e)
            return (
                "other locks blocking ingest have been acquired; releasing lock",
                HTTPStatus.CONFLICT,
            )

        return "", HTTPStatus.OK
Example #7
def _get_state_code_from_str(state_code_str: str) -> StateCode:
    if not StateCode.is_state_code(state_code_str):
        raise ValueError(
            f"Unknown region_code [{state_code_str}] received, must be a valid state code."
        )

    return StateCode[state_code_str.upper()]
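
A hedged usage sketch for Example #7; the input strings are illustrative:

    state_code = _get_state_code_from_str("us_nd")  # -> StateCode.US_ND
    try:
        _get_state_code_from_str("not_a_state")
    except ValueError as e:
        print(e)  # Unknown region_code [not_a_state] received, ...
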
Example #8
def read_db_entity_trees_of_cls_to_merge(
        session: Session, state_code: str,
        schema_cls: Type[StateBase]) -> List[List[EntityTree]]:
    """
    Returns a list of lists of EntityTree where each inner list is a group
    of EntityTrees with entities of class |schema_cls| that need to be merged
    because their entities have the same external_id.

    Will assert if schema_cls does not have a person_id or external_id field.
    """
    if not StateCode.is_valid(state_code):
        raise ValueError(f"Invalid state code: [{state_code}]")

    external_ids = dao.read_external_ids_of_cls_with_external_id_match(
        session, state_code, schema_cls)
    people = dao.read_people_by_cls_external_ids(session, state_code,
                                                 schema_cls, external_ids)
    all_cls_trees = get_all_entity_trees_of_cls(people, schema_cls)

    external_ids_map: Dict[str, List[EntityTree]] = defaultdict(list)
    for tree in all_cls_trees:
        if not isinstance(tree.entity, schema_cls):
            raise ValueError(f"Unexpected entity type [{type(tree.entity)}]")

        if tree.entity.external_id in external_ids:
            external_ids_map[tree.entity.external_id].append(tree)

    return [tree_list for _, tree_list in external_ids_map.items()]
Example #9
    def __init__(
        self,
        project_id: str,
        region: str,
        lower_bound_update_datetime: Optional[datetime.datetime],
        gcs_destination_path: Optional[GcsfsDirectoryPath] = None,
    ):
        self.project_id = project_id
        self.region = region.lower()

        self.auth = SftpAuth.for_region(region)
        self.delegate = SftpDownloadDelegateFactory.build(region_code=region)
        self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())

        self.unable_to_download_items: List[str] = []
        self.downloaded_items: List[Tuple[str, datetime.datetime]] = []
        self.skipped_files: List[str] = []

        self.lower_bound_update_datetime = lower_bound_update_datetime
        self.bucket = (
            gcs_destination_path
            if gcs_destination_path is not None
            else gcsfs_sftp_download_bucket_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id
            )
        )
        self.download_dir = GcsfsDirectoryPath.from_dir_and_subdir(
            dir_path=self.bucket, subdir=RAW_INGEST_DIRECTORY
        )

        self.postgres_direct_ingest_file_metadata_manager = (
            PostgresDirectIngestRawFileMetadataManager(
                region,
                DirectIngestInstance.PRIMARY.database_version(
                    SystemLevel.STATE, state_code=StateCode(self.region.upper())
                ).name,
            )
        )
Example #10
    def _get_dataflow_pipeline_enabled_states(self) -> Set[StateCode]:
        """Returns the set of StateCodes for all states present in our production calc
        pipeline template."""
        states = {
            pipeline.peek("state_code", str).upper()
            for pipeline in self.daily_pipelines
        }.union({
            pipeline.peek("state_code", str).upper()
            for pipeline in self.historical_pipelines
        })

        for state_code in states:
            if not StateCode.is_state_code(state_code):
                raise ValueError(
                    f"Found invalid state code value [{state_code}]"
                    f" in pipeline template config.")

        return {StateCode(state_code) for state_code in states}
Example #11
def get_data_folder(
    drive: Drive,
    state_code: states.StateCode,
    system: schema.System,
    base_drive_folder_id: str,
) -> DriveItem:
    state_folder = drive.get_folder(state_code.get_state().name, base_drive_folder_id)
    corrections_folder = drive.get_folder(system.value.title(), state_folder.id)
    return drive.get_folder("Data", corrections_folder.id)
Example #12
 def _main_database_key(cls) -> "SQLAlchemyDatabaseKey":
     if cls.schema_type() == SchemaType.STATE:
         state_code = StateCode(cls.region_code().upper())
         return SQLAlchemyDatabaseKey.for_state_code(
             state_code,
             cls._main_ingest_instance().database_version(
                 SystemLevel.STATE, state_code=state_code),
         )
     return SQLAlchemyDatabaseKey.for_schema(cls.schema_type())
Example #13
 def _ingest_lock_name_for_instance(self) -> str:
     if StateCode.is_state_code(self.region_code):
         return (
             STATE_GCS_TO_POSTGRES_INGEST_RUNNING_LOCK_PREFIX
             + self.region_code.upper()
             + f"_{self.ingest_instance.name}"
         )
     return (
         JAILS_GCS_TO_POSTGRES_INGEST_RUNNING_LOCK_PREFIX + self.region_code.upper()
     )
Example #14
    def test_get_batch_ids_valid_arguments(self) -> None:
        """Given all valid arguments, should have a list of batch ids, ordered in descending order,
        since we want the most recent batch to be at the top of the list"""
        self._upload_fake_email_buckets()
        batch_list = self.admin_stores.get_batch_ids(
            state_code=StateCode(self.STATE_CODE_STR), override_fs=self.fs
        )

        self.assertEqual(
            ["20210701202022", "20210701202021", "20210701202020"], batch_list)
Example #15
    def _get_translated_key_column_mask(self) -> int:
        """Returns an integer mask to add to every primary/foreign key column in this
        query. The mask is stable across all tables and derived from the region code.

        Example: 46000000000000

        For the above mask, if a primary key is 123456 in Postgres, then the translated
        primary key would be 46000000123456.
        """
        if not self.region_code:
            raise ValueError(
                "Must have set region code to do primary/foreign key translation."
            )
        if not StateCode.is_state_code(self.region_code):
            raise ValueError(
                "No support yet for doing primary/foreign key translation on non-state "
                "regions.")
        # The FIPS code is always a two-digit code for states
        fips = int(StateCode(self.region_code).get_state().fips)
        return fips * pow(10, 12)
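
To make the mask arithmetic in Example #15 concrete: with a FIPS code of 46, the mask is 46 * 10**12 = 46_000_000_000_000, so a Postgres key of 123456 translates to 46_000_000_123_456, matching the docstring's example. A standalone check of just that formula:

    fips = 46
    mask = fips * pow(10, 12)
    assert mask == 46_000_000_000_000
    assert mask + 123456 == 46_000_000_123_456
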
Example #16
 def from_report_json(report_json: dict) -> "Recipient":
     saved_report_json = copy.deepcopy(report_json)
     saved_report_json[utils.KEY_STATE_CODE] = StateCode(
         report_json[utils.KEY_STATE_CODE]
     )
     return Recipient(
         email_address=saved_report_json[utils.KEY_EMAIL_ADDRESS],
         state_code=saved_report_json[utils.KEY_STATE_CODE],
         district=saved_report_json[utils.KEY_DISTRICT],
         data=saved_report_json,
     )
Example #17
    def ingest_database_key(self) -> SQLAlchemyDatabaseKey:
        schema_type = self.system_level.schema_type()
        if schema_type == SchemaType.STATE:
            state_code = StateCode(self.region_code().upper())
            return SQLAlchemyDatabaseKey.for_state_code(
                state_code,
                self.ingest_instance.database_version(self.system_level,
                                                      state_code=state_code),
            )

        return SQLAlchemyDatabaseKey.for_schema(schema_type)
Example #18
def _create_ingest_catalog_summary() -> List[str]:
    """Creates the State Ingest Catalog portion of SUMMARY.md, as a list of lines."""
    ingest_catalog_states = sorted(
        [
            f.lower()
            for f in listdir(_INGEST_CATALOG_ROOT)
            if isdir(join(_INGEST_CATALOG_ROOT, f))
        ]
    )

    ingest_catalog_summary = ["## State Ingest Catalog\n\n"]

    for state in ingest_catalog_states:
        if StateCode.is_state_code(state):
            state_code = StateCode(state.upper())
            state_name = state_code.get_state()
        else:
            raise ValueError(
                f"Folder under {_INGEST_CATALOG_ROOT} named {state} is not a valid state code"
            )

        ingest_catalog_summary.extend(
            [
                f"- [{state_name}](ingest/{state}/{state}.md)\n",
                f"  - [Schema Mappings](ingest/{state}/schema_mappings.md)\n",
                f"  - [Raw Data Description](ingest/{state}/raw_data.md)\n",
            ]
        )

        raw_data_dir = join(_INGEST_CATALOG_ROOT, state, "raw_data")
        if not isdir(raw_data_dir):
            continue
        raw_data_files = sorted(
            [f for f in listdir(raw_data_dir) if isfile(join(raw_data_dir, f))]
        )

        for file_name in raw_data_files:
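            # Each generated doc is named "<file_tag>.md"; [:-3] strips the ".md" suffix.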
            ingest_catalog_summary.append(
                f"    - [{file_name[:-3]}](ingest/{state}/raw_data/{file_name})\n"
            )
    return ingest_catalog_summary
Example #19
    def _batch_ids() -> Tuple[str, HTTPStatus]:
        try:
            data = request.json
            state_code = StateCode(data.get("stateCode"))
            if state_code not in EMAIL_STATE_CODES:
                raise ValueError("State code is invalid for retrieving batch ids")

        except ValueError as error:
            logging.error(error)
            return str(error), HTTPStatus.BAD_REQUEST

        gcsfs_batch_ids = admin_stores.get_batch_ids(state_code)

        return (jsonify({"batchIds": gcsfs_batch_ids}), HTTPStatus.OK)
Example #20
    def for_region_code(cls, region_code: str,
                        is_direct_ingest: bool) -> "SystemLevel":
        if is_direct_ingest is None:
            raise ValueError(
                "Region flag is_direct_ingest is None, expected boolean value."
            )
        if not is_direct_ingest:
            # There are some scrapers that scrape state jails websites (e.g.
            # recidiviz/ingest/scrape/regions/us_pa/us_pa_scraper.py) which we always
            # write to the Vera county jails database.
            return SystemLevel.COUNTY

        if StateCode.is_state_code(region_code.upper()):
            return SystemLevel.STATE
        return SystemLevel.COUNTY
Example #21
    def get_states_by_product(
        self,
    ) -> Dict[ProductName, Dict[GCPEnvironment, List[StateCode]]]:
        """Returns the dict of products to states and environments."""
        states_by_product: Dict[
            ProductName, Dict[GCPEnvironment, List[StateCode]]
        ] = defaultdict(lambda: defaultdict(list))
        for product in self.products:
            if product.states is not None:
                for state in product.states:
                    environment = GCPEnvironment(state.environment)
                    state_code = StateCode(state.state_code)
                    states_by_product[product.name][environment].append(state_code)

        return states_by_product
Example #22
def _move_events_onto_supervision_periods_for_person(
    matched_persons: List[schema.StatePerson],
    event_cls: Type[DatabaseEntity],
    event_field_name: str,
    state_code: str,
) -> None:
    """For each person in |matched_persons|, moves all events of type |event_cls| onto the |event_field_name| field of
    a matching supervision period, based on date. If there is no matching supervision period, ensures that the events
    hang off of a placeholder chain.
    """
    if not StateCode.is_valid(state_code):
        raise ValueError(f"Invalid state code: [{state_code}]")

    for person in matched_persons:
        unmatched_events = _move_events_onto_supervision_periods(
            person, event_cls, event_field_name
        )
        if not unmatched_events:
            continue

        # We may hit this case if an entity that has already been committed to the DB has a date updated in a
        # later run such that the dates of the existing supervision periods no longer line up with one of the
        # existing events. In this case, we want to store the event on a placeholder chain starting at sentence_group.
        # We do this to show that the supervision violation isn't associated with anything other than the person.
        placeholder_sg = get_or_create_placeholder_child(
            person,
            "sentence_groups",
            schema.StateSentenceGroup,
            state_code=state_code,
            status=StateSentenceStatus.PRESENT_WITHOUT_INFO.value,
        )
        placeholder_s = get_or_create_placeholder_child(
            placeholder_sg,
            "supervision_sentences",
            schema.StateSupervisionSentence,
            person=person,
            state_code=state_code,
            status=StateSentenceStatus.PRESENT_WITHOUT_INFO.value,
        )
        placeholder_sp = get_or_create_placeholder_child(
            placeholder_s,
            "supervision_periods",
            schema.StateSupervisionPeriod,
            person=person,
            state_code=state_code,
            status=StateSupervisionPeriodStatus.PRESENT_WITHOUT_INFO.value,
        )
        placeholder_sp.set_field_from_list(event_field_name, unmatched_events)
Example #23
    def _release_ingest_lock() -> Tuple[str, HTTPStatus]:
        try:
            state_code = StateCode(request.json["stateCode"])
            ingest_instance = DirectIngestInstance(
                request.json["ingestInstance"])
        except ValueError:
            return "invalid parameters provided", HTTPStatus.BAD_REQUEST

        lock_manager = DirectIngestRegionLockManager.for_state_ingest(
            state_code, ingest_instance=ingest_instance)
        try:
            lock_manager.release_lock()
        except GCSPseudoLockDoesNotExist:
            return "lock does not exist", HTTPStatus.NOT_FOUND

        return "", HTTPStatus.OK
Example #24
    def test_state_codes_match_terraform_config(self) -> None:
        yaml_path = os.path.join(
            os.path.dirname(deploy.__file__),
            "terraform",
            "direct_ingest_state_codes.yaml",
        )
        with open(yaml_path, "r") as ymlfile:
            region_codes_list = yaml.full_load(ymlfile)

        for region in self.region_dir_names:
            if not StateCode.is_state_code(region):
                continue
            self.assertTrue(
                region.upper() in region_codes_list,
                f"State [{region}] must be listed in [{yaml_path}]",
            )
Example #25
    def _import_database_from_gcs() -> Tuple[str, HTTPStatus]:
        try:
            state_code = StateCode(request.json["stateCode"])
            db_version = SQLAlchemyStateDatabaseVersion(
                request.json["importToDatabaseVersion"].lower())
            ingest_instance = DirectIngestInstance.for_state_database_version(
                database_version=db_version, state_code=state_code)
            exported_db_version = SQLAlchemyStateDatabaseVersion(
                request.json["exportedDatabaseVersion"].lower())
        except ValueError:
            return "invalid parameters provided", HTTPStatus.BAD_REQUEST

        if db_version == SQLAlchemyStateDatabaseVersion.LEGACY:
            return "importToDatabaseVersion cannot be LEGACY", HTTPStatus.BAD_REQUEST

        lock_manager = DirectIngestRegionLockManager.for_state_ingest(
            state_code, ingest_instance=ingest_instance)
        if not lock_manager.can_proceed():
            return (
                "other locks blocking ingest have been acquired; aborting operation",
                HTTPStatus.CONFLICT,
            )

        db_key = SQLAlchemyDatabaseKey.for_state_code(state_code, db_version)
        cloud_sql_client = CloudSQLClientImpl(project_id=project_id)

        operation_id = cloud_sql_client.import_gcs_sql(
            db_key,
            GcsfsFilePath.from_absolute_path(
                f"{STATE_INGEST_EXPORT_URI}/{exported_db_version.value}/{state_code.value}"
            ),
        )
        if operation_id is None:
            return (
                "Cloud SQL import operation was not started successfully.",
                HTTPStatus.INTERNAL_SERVER_ERROR,
            )

        operation_succeeded = cloud_sql_client.wait_until_operation_completed(
            operation_id, seconds_to_wait=GCS_IMPORT_EXPORT_TIMEOUT_SEC)
        if not operation_succeeded:
            return (
                "Cloud SQL import did not complete within 60 seconds",
                HTTPStatus.INTERNAL_SERVER_ERROR,
            )

        return operation_id, HTTPStatus.OK
Example #26
 def get_export_configs_for_job_filter(
     self, export_job_filter: str
 ) -> List[ProductExportConfig]:
     """Returns the export configs for the given export_job_filter,
     which can be either state_code or export job name."""
     filter_uppercase = export_job_filter.upper()
     if StateCode.is_state_code(filter_uppercase):
         return [
             export
             for export in self.get_all_export_configs()
             if export["state_code"] == filter_uppercase
         ]
     return [
         export
         for export in self.get_all_export_configs()
         if export["export_job_name"] == filter_uppercase
     ]
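
A hedged usage sketch for Example #26, assuming a hypothetical `exporter` instance; the filter values are illustrative:

    # A valid state code filters by state...
    state_exports = exporter.get_export_configs_for_job_filter("US_ND")
    # ...anything else is treated as an export job name.
    job_exports = exporter.get_export_configs_for_job_filter("SOME_EXPORT_JOB")
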
Example #27
 def _get_state_metric_calculations(
     pipelines: List[YAMLDict], frequency: str
 ) -> Dict[str, List[PipelineMetricInfo]]:
     """Returns a dict of state names to lists of info about their regularly
     calculated metrics."""
     state_metric_calculations = defaultdict(list)
     for pipeline in pipelines:
         state_name = str(StateCode(pipeline.peek("state_code", str)).get_state())
         state_metric_calculations[state_name].extend(
             PipelineMetricInfo(
                 name=metric,
                 month_count=pipeline.peek_optional("calculation_month_count", int),
                 frequency=frequency,
             )
             for metric in pipeline.peek("metric_types", str).split()
         )
     return state_metric_calculations
Example #28
def download_data(
    state_code: states.StateCode,
    system: schema.System,
    base_drive_folder_id: str,
    base_local_directory: str,
    credentials_directory: str,
) -> None:
    local_directory = os.path.join(base_local_directory, state_code.value,
                                   system.value)
    os.makedirs(local_directory, exist_ok=True)

    drive = Drive(credentials_directory)

    state_folder = drive.get_folder(state_code.get_state().name,
                                    base_drive_folder_id)
    corrections_folder = drive.get_folder(system.value.title(),
                                          state_folder.id)
    data_folder = drive.get_folder("Data", corrections_folder.id)
    drive.download_data(data_folder.id, local_directory=local_directory)
Example #29
    def _pause_direct_ingest_instance() -> Tuple[str, HTTPStatus]:
        try:
            state_code = StateCode(request.json["stateCode"])
            ingest_instance = DirectIngestInstance(
                request.json["ingestInstance"])
        except ValueError:
            return "invalid parameters provided", HTTPStatus.BAD_REQUEST

        ingest_status_manager = DirectIngestInstanceStatusManager(
            region_code=state_code.value, ingest_instance=ingest_instance)
        try:
            ingest_status_manager.pause_instance()
        except Exception:
            return (
                "something went wrong pausing the intance",
                HTTPStatus.INTERNAL_SERVER_ERROR,
            )

        return "", HTTPStatus.OK
Example #30
    def __init__(
        self,
        project_id: str,
        region: str,
        start_date_bound: Optional[str],
        end_date_bound: Optional[str],
        dry_run: bool,
        file_filter: Optional[str],
    ):
        self.project_id = project_id
        self.region = region
        self.state_code = StateCode(region.upper())
        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound
        self.dry_run = dry_run
        self.file_filter = file_filter

        self.storage_bucket = gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code=region,
            system_level=SystemLevel.STATE,
            # Raw files are only ever stored in the PRIMARY storage bucket
            ingest_instance=DirectIngestInstance.PRIMARY,
            project_id=self.project_id,
        )
        self.ingest_bucket = gcsfs_direct_ingest_bucket_for_region(
            region_code=region,
            system_level=SystemLevel.STATE,
            # Raw files are only ever processed in the PRIMARY ingest bucket
            ingest_instance=DirectIngestInstance.PRIMARY,
            project_id=self.project_id,
        )

        self.mutex = threading.Lock()
        self.collect_progress: Optional[Bar] = None
        self.move_progress: Optional[Bar] = None
        self.moves_list: List[Tuple[str, str]] = []
        self.log_output_path = os.path.join(
            os.path.dirname(__file__),
            f"move_result_{region}_{self.project_id}_start_bound_{self.start_date_bound}_end_bound_"
            f"{self.end_date_bound}_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt",
        )