Example #1
def get_ingest_view_configs(
    region_code: str,
) -> List[DataDiscoveryStandardizedFileConfig]:
    """Collects ingest views for the region; reads columns from their corresponding fixture CSVs."""
    if not StateCode.is_state_code(region_code):
        raise ValueError(
            f"Unknown region_code [{region_code}] received, must be a valid state code."
        )

    region_code = region_code.lower()

    views = DirectIngestPreProcessedIngestViewCollector(
        get_region(region_code, is_direct_ingest=True), []
    ).collect_view_builders()

    configs = []
    for view in views:
        try:
            # TODO(#6925) Infer columns from the mapping file rather than the fixture csv
            fixture_path = os.path.join(
                os.path.dirname(recidiviz.__file__),
                f"tests/ingest/direct/direct_ingest_fixtures/{region_code}/{view.ingest_view_name}.csv",
            )

            with open(fixture_path, "r") as f:
                columns = f.readline().split(",")
        except FileNotFoundError:
            continue

        standardized_config = DataDiscoveryStandardizedFileConfig(
            file_tag=view.ingest_view_name,
            columns=columns,
        )

        configs.append(standardized_config)

    return configs
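A hypothetical call for illustration; the state code is an assumed valid value and the fixture layout matches the path built above:

configs = get_ingest_view_configs("US_ND")  # "US_ND" is an illustrative state code
for config in configs:
    # Each config pairs an ingest view name with the header row of its fixture CSV.
    print(config.file_tag, config.columns)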
Example #2
def _get_state_code_from_str(state_code_str: str) -> StateCode:
    if not StateCode.is_state_code(state_code_str):
        raise ValueError(
            f"Unknown region_code [{state_code_str}] received, must be a valid state code."
        )

    return StateCode[state_code_str.upper()]
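A minimal usage sketch, assuming US_ND is a member of the StateCode enum and that is_state_code accepts lower-case input, as the other examples suggest:

state_code = _get_state_code_from_str("us_nd")  # -> StateCode.US_ND
_get_state_code_from_str("not_a_state")         # raises ValueError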
Example #3
def main(argv: Optional[Sequence[str]] = None) -> int:
    """Generates direct ingest region documentation."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "filenames",
        nargs="*",
        help="Modified files to indicate which regions need their docs to be regenerated. "
        "Paths must be relative to the root of the repository. "
        "If none are provided, will use `git diff` to determine modified files.",
    )
    args = parser.parse_args(argv)

    # Arbitrary project ID - we just need to build views in order to obtain raw table dependencies
    with local_project_id_override(GCP_PROJECT_STAGING):
        modified = False
        touched_raw_data_regions = get_touched_raw_data_regions(args.filenames)
        for region_code in touched_raw_data_regions:
            if not StateCode.is_state_code(region_code):
                logging.info(
                    "Skipping raw data documentation for non-state region [%s]",
                    region_code,
                )
                continue
            logging.info(
                "Generating raw data documentation for region [%s]", region_code
            )
            modified |= generate_raw_data_documentation_for_region(region_code)
        if modified:
            update_summary_file(
                _create_ingest_catalog_summary(), "## State Ingest Catalog"
            )
        return 1 if modified else 0
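A hypothetical invocation of this entry point; the file path is illustrative only:

# Pass explicit modified files, or an empty list to fall back to `git diff`.
exit_code = main(["recidiviz/ingest/direct/regions/us_nd/raw_data/some_config.yaml"])
# Returns 1 if documentation was regenerated, 0 if nothing changed.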
Example #4
    def generate_raw_file_docs_for_region(self, region_code: str) -> Dict[str, str]:
        """Generates documentation for all raw file configs for the given region and
        returns all of it as a combined string.

        Returns one Markdown-formatted string per raw file, mapped to its filename, as
        well as a header file with a table of contents.
        """
        region_config = DirectIngestRegionRawFileConfig(region_code=region_code)

        sorted_file_tags = sorted(region_config.raw_file_tags)

        if StateCode.is_state_code(region_code):
            state_code = StateCode(region_code.upper())
            state_name = state_code.get_state().name

            file_header = STATE_RAW_DATA_FILE_HEADER_TEMPLATE.format(
                state_name=state_name, state_code_lower=state_code.value.lower()
            )
        else:
            file_header = ""

        raw_file_configs = [
            region_config.raw_file_configs[file_tag] for file_tag in sorted_file_tags
        ]

        config_paths_by_file_tag = {
            file_tag: file_config.file_path
            for file_tag, file_config in region_config.raw_file_configs.items()
        }

        file_tags_with_raw_file_configs = [
            raw_file_config.file_tag for raw_file_config in raw_file_configs
        ]

        region = regions.get_region(region_code=region_code, is_direct_ingest=True)

        view_collector = DirectIngestPreProcessedIngestViewCollector(region, [])
        views_by_raw_file = self.get_referencing_views(view_collector)
        touched_configs = self._get_touched_raw_data_configs(
            region_config.yaml_config_file_dir
        )

        raw_file_table = self._generate_raw_file_table(
            config_paths_by_file_tag,
            file_tags_with_raw_file_configs,
            views_by_raw_file,
            touched_configs,
        )

        docs_per_file: Dict[str, str] = {
            f"{config.file_tag}.md": self._generate_docs_for_raw_config(config)
            for config in raw_file_configs
        }

        docs_per_file[STATE_RAW_DATA_FILE_HEADER_PATH] = (
            file_header + "\n" + raw_file_table
        )

        return docs_per_file
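The shape of the returned mapping, sketched with hypothetical file tags (`generator` stands in for whatever object defines this method; STATE_RAW_DATA_FILE_HEADER_PATH comes from the snippet above):

docs = generator.generate_raw_file_docs_for_region("us_nd")
# {
#     "raw_file_one.md": "<markdown for that raw file config>",
#     "raw_file_two.md": "<markdown for that raw file config>",
#     STATE_RAW_DATA_FILE_HEADER_PATH: "<state header>\n<raw file table>",
# }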
Example #5
    def _ingest_lock_name_for_instance(self) -> str:
        if StateCode.is_state_code(self.region_code):
            return (
                STATE_GCS_TO_POSTGRES_INGEST_RUNNING_LOCK_PREFIX
                + self.region_code.upper()
                + f"_{self.ingest_instance.name}"
            )
        return (
            JAILS_GCS_TO_POSTGRES_INGEST_RUNNING_LOCK_PREFIX + self.region_code.upper()
        )
Example #6
    def generate_raw_file_docs_for_region(self, region_code: str) -> str:
        """Generates documentation for all raw file configs for the given region and returns all of it
        as a combined string."""
        region_config = DirectIngestRegionRawFileConfig(
            region_code=region_code)

        sorted_file_tags = sorted(region_config.raw_file_tags)

        if StateCode.is_state_code(region_code):
            state_code = StateCode(region_code.upper())
            state_name = state_code.get_state()

            file_header = STATE_RAW_DATA_FILE_HEADER_TEMPLATE.format(
                state_name=state_name,
                state_code_lower=state_code.value.lower())
        else:
            file_header = ""

        raw_file_configs = [
            region_config.raw_file_configs[file_tag]
            for file_tag in sorted_file_tags
        ]

        config_paths_by_file_tag = {
            file_tag: file_config.file_path
            for file_tag, file_config in
            region_config.raw_file_configs.items()
        }

        file_tags_with_raw_file_configs = [
            raw_file_config.file_tag for raw_file_config in raw_file_configs
        ]

        region = regions.get_region(region_code=region_code,
                                    is_direct_ingest=True)

        view_collector = DirectIngestPreProcessedIngestViewCollector(
            region, [])
        views_by_raw_file = self.get_referencing_views(view_collector)

        raw_file_table = self._generate_raw_file_table(
            config_paths_by_file_tag, file_tags_with_raw_file_configs,
            views_by_raw_file)

        docs_per_file = [
            self._generate_docs_for_raw_config(config)
            for config in raw_file_configs
        ]

        return file_header + "\n" + raw_file_table + "\n" + "\n\n".join(
            docs_per_file)
Example #7
    def _get_product_enabled_states(self) -> Set[StateCode]:
        states: Set[str] = set()
        for product in self.products:
            if product.states is not None:
                states = states.union(
                    {state.state_code
                     for state in product.states})

        for state_code in states:
            if not StateCode.is_state_code(state_code):
                raise ValueError(
                    f"Found invalid state code value [{state_code}]"
                    f" in product config.")
        return {StateCode(state_code) for state_code in states}
Example #8
    def for_region_code(cls, region_code: str,
                        is_direct_ingest: bool) -> "SystemLevel":
        if is_direct_ingest is None:
            raise ValueError(
                "Region flag is_direct_ingest is None, expected boolean value."
            )
        if not is_direct_ingest:
            # There are some scrapers that scrape state jails websites (e.g.
            # recidiviz/ingest/scrape/regions/us_pa/us_pa_scraper.py) which we always
            # write to the Vera county jails database.
            return SystemLevel.COUNTY

        if StateCode.is_state_code(region_code.upper()):
            return SystemLevel.STATE
        return SystemLevel.COUNTY
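Assuming this is a classmethod on the SystemLevel enum, its behavior can be sketched as follows (region codes are illustrative):

SystemLevel.for_region_code("us_nd", is_direct_ingest=True)              # -> SystemLevel.STATE
SystemLevel.for_region_code("us_ny_westchester", is_direct_ingest=True)  # -> SystemLevel.COUNTY (not a state code)
SystemLevel.for_region_code("us_pa", is_direct_ingest=False)             # -> SystemLevel.COUNTY (scraper region)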
Example #9
    def test_state_codes_match_terraform_config(self) -> None:
        yaml_path = os.path.join(
            os.path.dirname(deploy.__file__),
            "terraform",
            "direct_ingest_state_codes.yaml",
        )
        with open(yaml_path, "r") as ymlfile:
            region_codes_list = yaml.full_load(ymlfile)

        for region in self.region_dir_names:
            if not StateCode.is_state_code(region):
                continue
            self.assertTrue(
                region.upper() in region_codes_list,
                f"State [{region}] must be listed in [{yaml_path}]",
            )
Example #10
    def get_export_configs_for_job_filter(
        self, export_job_filter: str
    ) -> List[ProductExportConfig]:
        """Returns the export configs for the given export_job_filter,
        which can be either state_code or export job name."""
        filter_uppercase = export_job_filter.upper()
        if StateCode.is_state_code(filter_uppercase):
            return [
                export
                for export in self.get_all_export_configs()
                if export["state_code"] == filter_uppercase
            ]
        return [
            export
            for export in self.get_all_export_configs()
            if export["export_job_name"] == filter_uppercase
        ]
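A usage sketch with a hypothetical `exporter` instance and export job name:

# A state-code filter matches on each config's "state_code" key...
exporter.get_export_configs_for_job_filter("us_nd")
# ...anything else is treated as an export job name.
exporter.get_export_configs_for_job_filter("some_export_job")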
Example #11
    def _get_dataflow_pipeline_enabled_states(self) -> Set[StateCode]:
        """Returns the set of StateCodes for all states present in our production calc
        pipeline template."""
        states = {
            pipeline.peek("state_code", str).upper()
            for pipeline in self.daily_pipelines
        }.union({
            pipeline.peek("state_code", str).upper()
            for pipeline in self.historical_pipelines
        })

        for state_code in states:
            if not StateCode.is_state_code(state_code):
                raise ValueError(
                    f"Found invalid state code value [{state_code}]"
                    f" in pipeline template config.")

        return {StateCode(state_code) for state_code in states}
Example #12
    def _get_translated_key_column_mask(self) -> int:
        """Returns an integer mask to add to every primary/foreign key column in this
        query. The mask is stable across all tables and derived from the region code.

        Example: 46000000000000

        For the above mask, if a primary key is 123456 in Postgres, then the translated
        primary key would be 46000000123456.
        """
        if not self.region_code:
            raise ValueError(
                "Must have set region code to do primary/foreign key translation."
            )
        if not StateCode.is_state_code(self.region_code):
            raise ValueError(
                "No support yet for doing primary/foreign key translation on non-state "
                "regions.")
        # The FIPS code is always a two-digit code for states
        fips = int(StateCode(self.region_code).get_state().fips)
        return fips * pow(10, 12)
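The mask arithmetic from the docstring, spelled out with its own example values:

fips = 46                       # two-digit state FIPS code from the docstring example
mask = fips * pow(10, 12)       # 46000000000000
translated_key = mask + 123456  # 46000000123456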
Example #13
def _create_ingest_catalog_summary() -> List[str]:
    """Creates the State Ingest Catalog portion of SUMMARY.md, as a list of lines."""
    ingest_catalog_states = sorted(
        [
            f.lower()
            for f in listdir(_INGEST_CATALOG_ROOT)
            if isdir(join(_INGEST_CATALOG_ROOT, f))
        ]
    )

    ingest_catalog_summary = ["## State Ingest Catalog\n\n"]

    for state in ingest_catalog_states:
        if StateCode.is_state_code(state):
            state_code = StateCode(state.upper())
            state_name = state_code.get_state()
        else:
            raise ValueError(
                f"Folder under {_INGEST_CATALOG_ROOT} named {state} is not a valid state code"
            )

        ingest_catalog_summary.extend(
            [
                f"- [{state_name}](ingest/{state}/{state}.md)\n",
                f"  - [Schema Mappings](ingest/{state}/schema_mappings.md)\n",
                f"  - [Raw Data Description](ingest/{state}/raw_data.md)\n",
            ]
        )

        raw_data_dir = join(_INGEST_CATALOG_ROOT, state, "raw_data")
        if not isdir(raw_data_dir):
            continue
        raw_data_files = sorted(
            [f for f in listdir(raw_data_dir) if isfile(join(raw_data_dir, f))]
        )

        for file_name in raw_data_files:
            ingest_catalog_summary.append(
                f"    - [{file_name[:-3]}](ingest/{state}/raw_data/{file_name})\n"
            )
    return ingest_catalog_summary
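For a single state the generated lines might look like the following sketch; the state and raw data file name are illustrative:

example_summary = [
    "## State Ingest Catalog\n\n",
    "- [North Dakota](ingest/us_nd/us_nd.md)\n",
    "  - [Schema Mappings](ingest/us_nd/schema_mappings.md)\n",
    "  - [Raw Data Description](ingest/us_nd/raw_data.md)\n",
    "    - [some_raw_file](ingest/us_nd/raw_data/some_raw_file.md)\n",
]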
Example #14
    def test_regions_are_clean(self) -> None:
        """Check that all existing region directories start with a valid state code."""
        for region in self.region_dir_names:
            self.test.assertTrue(StateCode.is_state_code(region[:5]))
Example #15
def _validate_region_code(region_code: str) -> None:
    if not StateCode.is_state_code(region_code.upper()):
        raise ValueError(
            f"Unknown region_code [{region_code}] received, must be a valid state code."
        )