def test_raw_up_to_date_view(self) -> None:
        """Checks the names and queries of an up-to-date view for "table_name".

        The raw file config declares col1 (not a datetime), col2 (a datetime)
        and undocumented_column (a datetime whose description is None). The
        expected columns clause contains only col1 and a COALESCE expression
        for col2.
        """
        # Columns declared on the raw file config, in declaration order.
        column_infos = [
            RawTableColumnInfo(
                name="col1",
                is_datetime=False,
                description="col1 description",
            ),
            RawTableColumnInfo(
                name="col2",
                is_datetime=True,
                description="col2 description",
            ),
            RawTableColumnInfo(
                name="undocumented_column",
                is_datetime=True,
                description=None,
            ),
        ]
        file_config = DirectIngestRawFileConfig(
            file_tag="table_name",
            file_path="path/to/file.yaml",
            file_description="file description",
            primary_key_cols=["col1"],
            columns=column_infos,
            supplemental_order_by_clause="",
            encoding="any-encoding",
            separator="@",
            ignore_quotes=False,
            always_historical_export=False,
        )
        up_to_date_view = DirectIngestRawDataTableUpToDateView(
            region_code="us_xx",
            raw_file_config=file_config,
        )

        # The table id and the view id are expected to be the same name.
        expected_view_name = "table_name_by_update_date"
        self.assertEqual(self.PROJECT_ID, up_to_date_view.project)
        self.assertEqual(
            "us_xx_raw_data_up_to_date_views", up_to_date_view.dataset_id
        )
        self.assertEqual(expected_view_name, up_to_date_view.table_id)
        self.assertEqual(expected_view_name, up_to_date_view.view_id)

        # Expected SQL fragment for the datetime column col2. The whitespace
        # inside this literal is part of the expected query text.
        col2_datetime_clause = """
        COALESCE(
            CAST(SAFE_CAST(col2 AS DATETIME) AS STRING),
            CAST(SAFE_CAST(SAFE.PARSE_DATE('%m/%d/%y', col2) AS DATETIME) AS STRING),
            CAST(SAFE_CAST(SAFE.PARSE_DATE('%m/%d/%Y', col2) AS DATETIME) AS STRING),
            CAST(SAFE_CAST(SAFE.PARSE_TIMESTAMP('%Y-%m-%d %H:%M', col2) AS DATETIME) AS STRING),
            CAST(SAFE_CAST(SAFE.PARSE_TIMESTAMP('%m/%d/%Y %H:%M:%S', col2) AS DATETIME) AS STRING),
            col2
        ) AS col2"""

        template_args = dict(
            project_id=self.PROJECT_ID,
            raw_table_primary_key_str="col1",
            raw_table_dataset_id="us_xx_raw_data",
            raw_table_name="table_name",
            columns_clause="col1, " + col2_datetime_clause,
            legacy_except_clause="",
            legacy_datetime_cols_clause="",
            supplemental_order_by_clause="",
        )
        expected_query = RAW_DATA_UP_TO_DATE_VIEW_QUERY_TEMPLATE.format(
            **template_args
        )
        self.assertEqual(expected_query, up_to_date_view.view_query)

        expected_select = (
            "SELECT * FROM `recidiviz-456.us_xx_raw_data_up_to_date_views.table_name_by_update_date`"
        )
        self.assertEqual(expected_select, up_to_date_view.select_query)
    def test_raw_latest_historical_file_view(self) -> None:
        """Checks the names and queries of a latest view for a historical file.

        The raw file config sets always_historical_export=True, uses two
        primary key columns and supplies a supplemental ORDER BY clause. The
        expected query is rendered from the latest-historical-file template,
        with the supplemental clause prefixed by ", ".
        """
        column_infos = [
            RawTableColumnInfo(
                name="col1",
                is_datetime=False,
                description="col1 description",
            ),
            RawTableColumnInfo(
                name="col2",
                is_datetime=False,
                description="col2 description",
            ),
        ]
        file_config = DirectIngestRawFileConfig(
            file_tag="table_name",
            file_path="path/to/file.yaml",
            file_description="file description",
            primary_key_cols=["col1", "col2"],
            columns=column_infos,
            supplemental_order_by_clause="CAST(seq_num AS INT64)",
            encoding="any-encoding",
            separator="@",
            ignore_quotes=False,
            always_historical_export=True,
        )
        latest_view = DirectIngestRawDataTableLatestView(
            region_code="us_xx",
            raw_file_config=file_config,
            dataset_overrides=None,
        )

        # The table id and the view id are expected to be the same name.
        expected_view_name = "table_name_latest"
        self.assertEqual(self.PROJECT_ID, latest_view.project)
        self.assertEqual(
            "us_xx_raw_data_up_to_date_views", latest_view.dataset_id
        )
        self.assertEqual(expected_view_name, latest_view.table_id)
        self.assertEqual(expected_view_name, latest_view.view_id)

        template_args = dict(
            project_id=self.PROJECT_ID,
            raw_table_primary_key_str="col1, col2",
            raw_table_dataset_id="us_xx_raw_data",
            raw_table_name="table_name",
            columns_clause="col1, col2",
            legacy_except_clause="",
            legacy_datetime_cols_clause="",
            supplemental_order_by_clause=", CAST(seq_num AS INT64)",
        )
        expected_query = RAW_DATA_LATEST_HISTORICAL_FILE_VIEW_QUERY_TEMPLATE.format(
            **template_args
        )
        self.assertEqual(expected_query, latest_view.view_query)

        expected_select = (
            "SELECT * FROM `recidiviz-456.us_xx_raw_data_up_to_date_views.table_name_latest`"
        )
        self.assertEqual(expected_select, latest_view.select_query)
def _get_columns_by_file(
        state_code: str,
        project_id: str) -> Dict[str, List[RawTableColumnInfo]]:
    """Builds a mapping from raw table name to its RawTableColumnInfo list.

    Queries INFORMATION_SCHEMA.COLUMNS of the "<lowercased state_code>_raw_data"
    dataset in the given project, ordered by table name and ordinal position.
    Rows for the "file_id" and "update_datetime" columns are skipped. Every
    other row becomes a RawTableColumnInfo whose is_datetime flag is set when
    the reported data type is DATETIME (compared case-insensitively) and whose
    description is the placeholder "TKTK".
    """
    raw_data_dataset = f"{state_code.lower()}_raw_data"

    query_string = f"""
SELECT
 * EXCEPT(is_generated, generation_expression, is_stored, is_updatable)
FROM
 `{project_id}.{raw_data_dataset}.INFORMATION_SCHEMA.COLUMNS`
ORDER BY
  table_name ASC, ordinal_position ASC
"""

    # Columns that are never turned into RawTableColumnInfo entries.
    skipped_columns = {"file_id", "update_datetime"}

    columns_by_file: Dict[str, List[RawTableColumnInfo]] = {}
    result_rows = BigQueryClientImpl().run_query_async(query_string)
    for result_row in result_rows:
        name = result_row["column_name"]
        if name in skipped_columns:
            continue

        table = result_row["table_name"]
        info = RawTableColumnInfo(
            name=name,
            is_datetime=result_row["data_type"].upper() == "DATETIME",
            description="TKTK",
        )
        # Start a new list the first time a table is seen, then append.
        columns_by_file.setdefault(table, []).append(info)

    return columns_by_file
    def test_parse_yaml(self) -> None:
        """Checks the raw file configs parsed from the us_xx fixture directory.

        Verifies the number and tags of the parsed configs, then inspects the
        configs for file_tag_first, file_tag_second, tagC and
        tagPipeSeparatedNonUTF8 one by one.
        """
        parsed_region = DirectIngestRegionRawFileConfig(
            region_code="us_xx",
            yaml_config_file_dir=fixtures.as_filepath("us_xx"),
        )

        # --- Overall set of parsed configs ---
        expected_file_tags = {
            "file_tag_first",
            "file_tag_second",
            "tagC",
            "tagFullHistoricalExport",
            "tagInvalidCharacters",
            "tagNormalizationConflict",
            "tagPipeSeparatedNonUTF8",
        }
        self.assertEqual(7, len(parsed_region.raw_file_configs))
        self.assertEqual(
            expected_file_tags, parsed_region.raw_file_configs.keys()
        )

        # --- file_tag_first ---
        first_config = parsed_region.raw_file_configs["file_tag_first"]
        self.assertEqual("file_tag_first", first_config.file_tag)
        self.assertEqual("First raw file.", first_config.file_description)
        self.assertEqual(
            ["col_name_1a", "col_name_1b"], first_config.primary_key_cols
        )
        self.assertEqual("ISO-456-7", first_config.encoding)
        self.assertEqual(",", first_config.separator)

        multiline_column_description = (
            "A column description that is long enough to take up\nmultiple lines. This"
            " text block will be interpreted\nliterally and trailing/leading whitespace"
            " is removed.")
        first_config_columns = [
            RawTableColumnInfo(
                name="col_name_1a",
                is_datetime=False,
                description="First column.",
            ),
            RawTableColumnInfo(
                name="col_name_1b",
                is_datetime=False,
                description=multiline_column_description,
            ),
            RawTableColumnInfo(
                name="undocumented_column",
                is_datetime=False,
                description=None,
            ),
        ]
        self.assertEqual(first_config_columns, first_config.columns)

        # --- file_tag_second ---
        second_config = parsed_region.raw_file_configs["file_tag_second"]
        second_config_description = (
            "Some special/unusual character's in the description &\nlong enough to"
            " make a second line!\\n Trailing/leading white\nspace is stripped & the"
            " text block is interpreted literally.")
        self.assertEqual("file_tag_second", second_config.file_tag)
        self.assertEqual(
            second_config_description, second_config.file_description
        )
        self.assertEqual(["col_name_2a"], second_config.primary_key_cols)
        self.assertEqual("UTF-8", second_config.encoding)
        self.assertEqual("$", second_config.separator)
        second_config_columns = [
            RawTableColumnInfo(
                name="col_name_2a",
                is_datetime=False,
                description="column description",
            ),
        ]
        self.assertEqual(second_config_columns, second_config.columns)

        # --- tagC ---
        tag_c_config = parsed_region.raw_file_configs["tagC"]
        self.assertEqual("tagC", tag_c_config.file_tag)
        self.assertEqual(
            "tagC file description", tag_c_config.file_description
        )
        self.assertEqual(["COL1"], tag_c_config.primary_key_cols)
        self.assertEqual("UTF-8", tag_c_config.encoding)
        self.assertEqual(",", tag_c_config.separator)
        tag_c_columns = [
            RawTableColumnInfo(
                name="COL1",
                is_datetime=False,
                description=None,
            ),
        ]
        self.assertEqual(tag_c_columns, tag_c_config.columns)

        # --- tagPipeSeparatedNonUTF8 ---
        pipe_config = parsed_region.raw_file_configs["tagPipeSeparatedNonUTF8"]
        self.assertEqual("tagPipeSeparatedNonUTF8", pipe_config.file_tag)
        self.assertEqual(["PRIMARY_COL1"], pipe_config.primary_key_cols)
        self.assertEqual("ISO-8859-1", pipe_config.encoding)
        self.assertEqual("|", pipe_config.separator)
# Example no. 5
# 0
 def _get_raw_data_file_configs(
         self) -> Dict[str, DirectIngestRawFileConfig]:
     """Returns the mock raw file configs used by this class, keyed by file tag.

     tagA, tagB and tagC each declare a single "mockKey" column that is also
     their only primary key. tagWeDoNotIngest declares no primary keys and no
     columns. All other settings are the same for every config.
     """

     def build_config(file_tag: str, file_path: str,
                      has_mock_key: bool = True) -> DirectIngestRawFileConfig:
         # A fresh key list and column list are created for every config so
         # that no two configs share mutable objects.
         key_cols = ["mockKey"] if has_mock_key else []
         column_infos = [
             RawTableColumnInfo(
                 name=key_col,
                 description="mockKey description",
                 is_datetime=False,
             )
             for key_col in key_cols
         ]
         return DirectIngestRawFileConfig(
             file_tag=file_tag,
             file_path=file_path,
             file_description="file description",
             primary_key_cols=key_cols,
             columns=column_infos,
             supplemental_order_by_clause="",
             encoding="UTF-8",
             separator=",",
             custom_line_terminator=None,
             ignore_quotes=False,
             always_historical_export=False,
         )

     configs_by_tag = {
         "tagA": build_config("tagA", "path/to/tagA.yaml"),
         "tagB": build_config("tagB", "path/to/tagB.yaml"),
         "tagC": build_config("tagC", "path/to/tagC.yaml"),
     }
     # Inserted last so the key order matches tagA, tagB, tagC, tagWeDoNotIngest.
     configs_by_tag["tagWeDoNotIngest"] = build_config(
         "tagWeDoNotIngest",
         "path/to/tagWeDoNotIngest.yaml",
         has_mock_key=False,
     )
     return configs_by_tag
    def test_parse_yaml(self) -> None:
        """Checks the raw file configs parsed for us_xx from the fake regions module.

        Verifies the number and tags of the parsed configs, then inspects the
        configs for file_tag_first (including its known enum values),
        file_tag_second, tagC and tagPipeSeparatedNonUTF8 one by one.
        """
        parsed_region = DirectIngestRegionRawFileConfig(
            region_code="us_xx",
            region_module=fake_regions_module,
        )

        # --- Overall set of parsed configs ---
        expected_file_tags = {
            "file_tag_first",
            "file_tag_second",
            "tagC",
            "tagColCapsDoNotMatchConfig",
            "tagFullHistoricalExport",
            "tagInvalidCharacters",
            "tagNormalizationConflict",
            "tagCustomLineTerminatorNonUTF8",
            "tagPipeSeparatedNonUTF8",
            "tagDoubleDaggerWINDOWS1252",
            "tagColumnsMissing",
            "tagRowExtraColumns",
            "tagRowMissingColumns",
        }
        self.assertEqual(13, len(parsed_region.raw_file_configs))
        self.assertEqual(
            expected_file_tags, set(parsed_region.raw_file_configs.keys())
        )

        # --- file_tag_first ---
        first_config = parsed_region.raw_file_configs["file_tag_first"]
        self.assertEqual("file_tag_first", first_config.file_tag)
        self.assertEqual("First raw file.", first_config.file_description)
        self.assertEqual(
            ["col_name_1a", "col_name_1b"], first_config.primary_key_cols
        )
        self.assertEqual("ISO-456-7", first_config.encoding)
        self.assertEqual(",", first_config.separator)
        self.assertIsNone(first_config.custom_line_terminator)

        multiline_column_description = (
            "A column description that is long enough to take up\nmultiple lines. This"
            " text block will be interpreted\nliterally and trailing/leading whitespace"
            " is removed.")
        first_config_columns = [
            RawTableColumnInfo(
                name="col_name_1a",
                is_datetime=False,
                description="First column.",
                known_values=[
                    ColumnEnumValueInfo(
                        value="A", description="A description"
                    ),
                    ColumnEnumValueInfo(value="B", description=None),
                ],
            ),
            RawTableColumnInfo(
                name="col_name_1b",
                is_datetime=False,
                description=multiline_column_description,
            ),
            RawTableColumnInfo(
                name="undocumented_column",
                is_datetime=False,
                description=None,
            ),
        ]
        self.assertEqual(first_config_columns, first_config.columns)

        # --- file_tag_second ---
        second_config = parsed_region.raw_file_configs["file_tag_second"]
        second_config_description = (
            "Some special/unusual character's in the description &\nlong enough to"
            " make a second line!\\n Trailing/leading white\nspace is stripped & the"
            " text block is interpreted literally.")
        self.assertEqual("file_tag_second", second_config.file_tag)
        self.assertEqual(
            second_config_description, second_config.file_description
        )
        self.assertEqual(["col_name_2a"], second_config.primary_key_cols)
        self.assertEqual("UTF-8", second_config.encoding)
        self.assertEqual("$", second_config.separator)
        second_config_columns = [
            RawTableColumnInfo(
                name="col_name_2a",
                is_datetime=False,
                description="column description",
            ),
        ]
        self.assertEqual(second_config_columns, second_config.columns)

        # --- tagC ---
        tag_c_config = parsed_region.raw_file_configs["tagC"]
        self.assertEqual("tagC", tag_c_config.file_tag)
        self.assertEqual(
            "tagC file description", tag_c_config.file_description
        )
        self.assertEqual(["COL1"], tag_c_config.primary_key_cols)
        self.assertEqual("UTF-8", tag_c_config.encoding)
        self.assertEqual(",", tag_c_config.separator)
        tag_c_columns = [
            RawTableColumnInfo(
                name="COL1",
                is_datetime=False,
                description=None,
                known_values=None,
            ),
        ]
        self.assertEqual(tag_c_columns, tag_c_config.columns)

        # --- tagPipeSeparatedNonUTF8 ---
        pipe_config = parsed_region.raw_file_configs["tagPipeSeparatedNonUTF8"]
        self.assertEqual("tagPipeSeparatedNonUTF8", pipe_config.file_tag)
        self.assertEqual(["PRIMARY_COL1"], pipe_config.primary_key_cols)
        self.assertEqual("ISO-8859-1", pipe_config.encoding)
        self.assertEqual("|", pipe_config.separator)