def parsed_pdf(request):
    request.cls.parsed_pdf = tn_aggregate_ingest.parse(
        '', fixtures.as_filepath('_jailjanuary2019.pdf'))
    request.cls.parsed_female_pdf = tn_aggregate_ingest.parse(
        '', fixtures.as_filepath('_jailfemalejanuary2019.pdf'))
    request.cls.parsed_newer_pdf = tn_aggregate_ingest.parse(
        '', fixtures.as_filepath('_jailmarch2020.pdf'))
def setUpClass(cls) -> None:
    # Cache the parsed pdf between tests since it's expensive to compute
    cls.parsed_pdf = fl_aggregate_ingest.parse(
        fixtures.as_filepath("jails-2018-01.pdf"))
    cls.parsed_pdf_2 = fl_aggregate_ingest.parse(
        fixtures.as_filepath(
            "florida__pub_jails_2019_2019_06 june fcdf.pdf"))
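# The setUpClass hooks in this listing lean on a stock unittest idiom: the
# hook runs once per test class, so an expensive parse can be cached as a
# class attribute and shared across tests. A minimal self-contained sketch of
# the same pattern (`expensive_parse` is a hypothetical stand-in, not part of
# the codebase above):
import unittest


def expensive_parse(path: str) -> dict:
    # Hypothetical stand-in for a slow PDF-parsing call.
    return {"path": path}


class CachedParseTest(unittest.TestCase):
    parsed: dict

    @classmethod
    def setUpClass(cls) -> None:
        # Runs once for the whole class; note that setUpClass must be a
        # classmethod in real test code.
        cls.parsed = expensive_parse("example.pdf")

    def test_uses_cached_parse(self) -> None:
        self.assertEqual("example.pdf", self.parsed["path"])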
def parsed_pdf(request):
    request.cls.parsed_pdf = fl_aggregate_ingest.parse(
        "", fixtures.as_filepath("jails-2018-01.pdf")
    )
    request.cls.parsed_pdf_2 = fl_aggregate_ingest.parse(
        "", fixtures.as_filepath("florida__pub_jails_2019_2019_06 june fcdf.pdf")
    )
def parsed_pdf(request):
    request.cls.parsed_pdf = ky_aggregate_ingest.parse(
        "", fixtures.as_filepath("12-20-18.pdf"))
    request.cls.parsed_pdf_2 = ky_aggregate_ingest.parse(
        "", fixtures.as_filepath("08-23-18.pdf"))
    request.cls.parsed_pdf_3 = ky_aggregate_ingest.parse(
        "", fixtures.as_filepath("08-22-19.pdf"))
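# The parsed_pdf(request) helpers above follow the class-scoped pytest
# fixture pattern: results are attached to the requesting test class via
# request.cls so unittest-style methods can read them as attributes. A
# minimal sketch of the conventional wiring (fixture body and class name are
# illustrative, not taken from the code above):
import pytest


@pytest.fixture(scope="class")
def parsed_stub(request):
    # Attach a (stubbed) parse result to whichever class uses this fixture.
    request.cls.parsed_stub = {"stub": True}


@pytest.mark.usefixtures("parsed_stub")
class TestParsedStub:
    def test_sees_fixture_result(self):
        assert self.parsed_stub == {"stub": True}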
def setUpClass(cls) -> None:
    # Cache the parsed pdf between tests since it's expensive to compute
    cls.parsed_pdf = ny_aggregate_ingest.parse(
        fixtures.as_filepath("jail_population.pdf")
    )
    cls.parsed_pdf_3_pages = ny_aggregate_ingest.parse(
        fixtures.as_filepath("jail_population_2019.pdf")
    )
def setUpClass(cls) -> None:
    # Cache the parsed pdfs between tests since it's expensive to compute
    cls.parsed_pdf = tn_aggregate_ingest.parse(
        fixtures.as_filepath("_jailjanuary2019.pdf"))
    cls.parsed_female_pdf = tn_aggregate_ingest.parse(
        fixtures.as_filepath("_jailfemalejanuary2019.pdf"))
    cls.parsed_newer_pdf = tn_aggregate_ingest.parse(
        fixtures.as_filepath("_jailmarch2020.pdf"))
def setUpClass(cls) -> None:
    # Cache the parsed pdf between tests since it's expensive to compute
    cls.parsed_pdf = ky_aggregate_ingest.parse(
        fixtures.as_filepath("12-20-18.pdf"))
    cls.parsed_pdf_2 = ky_aggregate_ingest.parse(
        fixtures.as_filepath("08-23-18.pdf"))
    cls.parsed_pdf_3 = ky_aggregate_ingest.parse(
        fixtures.as_filepath("08-22-19.pdf"))
def setUpClass(cls) -> None:
    # Cache the parsed pdf between tests since it's expensive to compute
    cls.parsed_pdf_before_1996 = tx_aggregate_ingest.parse(
        fixtures.as_filepath("abbreviated pop rpt march 1994.pdf"))
    cls.parsed_pdf_1996 = tx_aggregate_ingest.parse(
        fixtures.as_filepath(
            "texas_url_abbreviated pop rpt June 1996.pdf"))
    cls.parsed_pdf_after_1996 = tx_aggregate_ingest.parse(
        fixtures.as_filepath("Abbreviated Pop Rpt Dec 2017.pdf"))
    cls.parsed_pdf_concat = tx_aggregate_ingest.parse(
        fixtures.as_filepath(
            "docs_abbreviatedpopreports_abbreviated pop rpt oct 2003.pdf"))
def test_get_export_config_valid(self) -> None:
    product_configs = ProductConfigs.from_file(
        path=fixtures.as_filepath("fixture_products.yaml"))
    _export_config = product_configs.get_export_config(
        export_job_name="EXPORT",
        state_code="US_XX",
    )
def test_read_with_exception(self) -> None:
    class _TestException(ValueError):
        pass

    class _ExceptionDelegate(_TestGcsfsCsvReaderDelegate):
        def on_dataframe(
            self, encoding: str, chunk_num: int, df: pd.DataFrame
        ) -> bool:
            should_continue = super().on_dataframe(encoding, chunk_num, df)
            if chunk_num > 0:
                raise _TestException("We crashed processing!")
            return should_continue

    file_path = fixtures.as_filepath("encoded_utf_8.csv")
    delegate = _ExceptionDelegate()

    with self.assertRaises(_TestException):
        self.reader.streaming_read(
            GcsfsFilePath.from_absolute_path(file_path),
            delegate=delegate,
            chunk_size=1,
        )

    self.assertEqual(1, len(delegate.encodings_attempted))
    self.assertEqual("UTF-8", delegate.encodings_attempted[0])
    self.assertIsNone(delegate.successful_encoding)
    self.assertEqual(2, len(delegate.dataframes))
    self.assertEqual({"UTF-8"}, {encoding for encoding, df in delegate.dataframes})
    self.assertEqual(0, delegate.decode_errors)
    self.assertEqual(1, delegate.exceptions)
def test_direct_ingest_preprocessed_view_detect_row_deletion_unknown_pk_table_specified(
    self,
) -> None:
    region_config = DirectIngestRegionRawFileConfig(
        region_code="us_xx",
        yaml_config_file_dir=fixtures.as_filepath("us_xx"),
    )
    view_query_template = """SELECT * FROM {file_tag_first}
LEFT OUTER JOIN {tagFullHistoricalExport}
USING (col1);"""

    with self.assertRaises(ValueError) as e:
        DirectIngestPreProcessedIngestView(
            ingest_view_name="ingest_view_tag",
            view_query_template=view_query_template,
            region_raw_table_config=region_config,
            order_by_cols="col1, col2",
            is_detect_row_deletion_view=True,
            primary_key_tables_for_entity_deletion=[
                "tagFullHistoricalExport",
                "unknown",
            ],
        )
    self.assertTrue(
        str(e.exception).startswith(
            "Ingest view ingest_view_tag has specified unknown in "
            "`primary_key_tables_for_entity_deletion`, but that "
            "raw file tag was not found as a dependency."))
def test_parse_empty_yaml_throws(self):
    with self.assertRaises(ValueError):
        _ = DirectIngestRegionRawFileConfig(
            region_code='us_xx',
            yaml_config_file_path=fixtures.as_filepath(
                'empty_raw_data_files.yaml'),
        )
def test_happy_path(self) -> None:
    yaml_path = fixtures.as_filepath("schema_config.yaml")
    validation_schema_config = DatasetSchemaInfo.from_yaml(yaml_path)

    expected = DatasetSchemaInfo(
        dataset="fixture_schema",
        tables=[
            TableSchemaInfo(
                table_name="incarceration_population_by_facility",
                columns=[
                    "date_of_stay",
                    "facility",
                    "month",
                    "population_count",
                    "region_code",
                    "year",
                ],
            ),
            TableSchemaInfo(
                table_name="incarceration_population_person_level",
                columns=[
                    "date_of_stay",
                    "facility",
                    "person_external_id",
                    "region_code",
                ],
            ),
        ],
    )

    self.assertEqual(expected, validation_schema_config)
def _build(
    self, *, dataset_overrides: Optional[Dict[str, str]] = None
) -> DirectIngestPreProcessedIngestView:
    region_config = DirectIngestRegionRawFileConfig(
        region_code="us_xx",
        yaml_config_file_dir=fixtures.as_filepath("us_xx"),
    )
    query = "select * from {file_tag_first} JOIN {tagFullHistoricalExport} USING (COL_1)"
    primary_key_tables_for_entity_deletion = (
        [] if not self.is_detect_row_deletion_view else ["tagFullHistoricalExport"]
    )
    return DirectIngestPreProcessedIngestView(
        ingest_view_name=self.tag,
        view_query_template=query,
        region_raw_table_config=region_config,
        order_by_cols="colA, colC",
        is_detect_row_deletion_view=self.is_detect_row_deletion_view,
        primary_key_tables_for_entity_deletion=primary_key_tables_for_entity_deletion,
        materialize_raw_data_table_views=self.materialize_raw_data_table_views,
    )
def test_read_completely_empty_file(self) -> None:
    empty_file_path = fixtures.as_filepath("tagA.csv")

    delegate = _TestGcsfsCsvReaderDelegate()
    self.reader.streaming_read(
        GcsfsFilePath.from_absolute_path(empty_file_path),
        delegate=delegate,
        chunk_size=1,
    )
    self.assertEqual(1, len(delegate.encodings_attempted))
    self.assertEqual(delegate.encodings_attempted[0], delegate.successful_encoding)
    self.assertEqual(0, len(delegate.dataframes))
    self.assertEqual(0, delegate.decode_errors)
    self.assertEqual(0, delegate.exceptions)

    delegate = _TestGcsfsCsvReaderDelegate()
    self.reader.streaming_read(
        GcsfsFilePath.from_absolute_path(empty_file_path),
        delegate=delegate,
        chunk_size=10,
    )
    self.assertEqual(1, len(delegate.encodings_attempted))
    self.assertEqual(delegate.encodings_attempted[0], delegate.successful_encoding)
    self.assertEqual(0, len(delegate.dataframes))
    self.assertEqual(0, delegate.decode_errors)
    self.assertEqual(0, delegate.exceptions)
def test_direct_ingest_preprocessed_view_same_table_multiple_places(self):
    region_config = DirectIngestRegionRawFileConfig(
        region_code='us_xx',
        yaml_config_file_path=fixtures.as_filepath(
            'us_xx_raw_data_files.yaml'),
    )
    view_query_template = """SELECT * FROM {file_tag_first}
LEFT OUTER JOIN {file_tag_first}
USING (col1);"""

    view = DirectIngestPreProcessedIngestView(
        ingest_view_name='ingest_view_tag',
        view_query_template=view_query_template,
        region_raw_table_config=region_config)

    self.assertEqual(
        ['file_tag_first'],
        [c.file_tag for c in view.raw_table_dependency_configs])

    expected_view_query = """WITH file_tag_first_generated_view AS (
    SELECT * FROM `recidiviz-456.us_xx_raw_data_up_to_date_views.file_tag_first_latest`
)
SELECT * FROM file_tag_first_generated_view
LEFT OUTER JOIN file_tag_first_generated_view
USING (col1);"""

    self.assertEqual(expected_view_query, view.view_query)
def test_direct_ingest_preprocessed_view_detect_row_deletion_no_historical_table(
    self,
) -> None:
    region_config = DirectIngestRegionRawFileConfig(
        region_code="us_xx",
        yaml_config_file_dir=fixtures.as_filepath("us_xx"),
    )
    view_query_template = """SELECT * FROM {file_tag_first}
LEFT OUTER JOIN {file_tag_second}
USING (col1);"""

    with self.assertRaises(ValueError) as e:
        DirectIngestPreProcessedIngestView(
            ingest_view_name="ingest_view_tag",
            view_query_template=view_query_template,
            region_raw_table_config=region_config,
            order_by_cols="col1, col2",
            is_detect_row_deletion_view=True,
            primary_key_tables_for_entity_deletion=["file_tag_second"],
        )
    self.assertTrue(
        str(e.exception).startswith(
            "Ingest view ingest_view_tag is marked as `is_detect_row_deletion_view` and has table file_tag_second "
            "specified in `primary_key_tables_for_entity_deletion`; however the raw data file is not marked as always "
            "being exported as historically."))
def test_read_file_with_columns_no_contents(self):
    empty_file_path = fixtures.as_filepath('tagB.csv')

    delegate = TestGcsfsCsvReaderDelegate()
    self.reader.streaming_read(
        GcsfsFilePath.from_absolute_path(empty_file_path),
        delegate=delegate,
        chunk_size=1)
    self.assertEqual(1, len(delegate.encodings_attempted))
    self.assertEqual(delegate.encodings_attempted[0], delegate.successful_encoding)
    self.assertEqual(1, len(delegate.dataframes))
    encoding, df = delegate.dataframes[0]
    self.assertEqual(encoding, delegate.successful_encoding)
    self.assertEqual(0, df.shape[0])  # No rows
    self.assertEqual(7, df.shape[1])  # 7 columns
    self.assertEqual(0, delegate.decode_errors)
    self.assertEqual(0, delegate.exceptions)

    delegate = TestGcsfsCsvReaderDelegate()
    self.reader.streaming_read(
        GcsfsFilePath.from_absolute_path(empty_file_path),
        delegate=delegate,
        chunk_size=10)
    self.assertEqual(1, len(delegate.encodings_attempted))
    self.assertEqual(delegate.encodings_attempted[0], delegate.successful_encoding)
    self.assertEqual(1, len(delegate.dataframes))
    encoding, df = delegate.dataframes[0]
    self.assertEqual(encoding, delegate.successful_encoding)
    self.assertEqual(0, df.shape[0])  # No rows
    self.assertEqual(7, df.shape[1])  # 7 columns
    self.assertEqual(0, delegate.decode_errors)
    self.assertEqual(0, delegate.exceptions)
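# The differing expectations between the two empty-file tests above track
# pandas' own behavior: read_csv raises EmptyDataError for a file with no
# bytes at all (so no dataframe is ever emitted), but returns a zero-row
# DataFrame when only a header line is present. A standalone illustration:
import io

import pandas as pd
from pandas.errors import EmptyDataError

try:
    # Completely empty input: pandas cannot infer columns and raises.
    pd.read_csv(io.StringIO(""))
except EmptyDataError:
    print("empty file -> EmptyDataError, no dataframe emitted")

# Header-only input: one dataframe with columns but zero rows.
df = pd.read_csv(io.StringIO("c1,c2,c3,c4,c5,c6,c7\n"))
print(df.shape)  # (0, 7) -- matches the 0-row, 7-column assertions above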
def test_direct_ingest_preprocessed_view_detect_row_deletion_no_pk_tables_specified(
    self,
) -> None:
    region_config = DirectIngestRegionRawFileConfig(
        region_code="us_xx",
        yaml_config_file_dir=fixtures.as_filepath("us_xx"),
    )
    view_query_template = """SELECT * FROM {file_tag_first}
LEFT OUTER JOIN {tagFullHistoricalExport}
USING (col1);"""

    with self.assertRaises(ValueError) as e:
        DirectIngestPreProcessedIngestView(
            ingest_view_name="ingest_view_tag",
            view_query_template=view_query_template,
            region_raw_table_config=region_config,
            order_by_cols="col1, col2",
            is_detect_row_deletion_view=True,
            primary_key_tables_for_entity_deletion=[],
        )
    self.assertTrue(
        str(e.exception).startswith(
            "Ingest view ingest_view_tag was marked as `is_detect_row_deletion_view`; however no "
            "`primary_key_tables_for_entity_deletion` were defined."))
def test_direct_ingest_preprocessed_view_other_materialized_subquery_fails(
    self,
) -> None:
    region_config = DirectIngestRegionRawFileConfig(
        region_code="us_xx",
        yaml_config_file_dir=fixtures.as_filepath("us_xx"),
    )
    view_query_template = """
CREATE TEMP TABLE my_subquery AS (SELECT * FROM {file_tag_first});
SELECT * FROM my_subquery;"""

    with self.assertRaises(ValueError) as e:
        _ = DirectIngestPreProcessedIngestView(
            ingest_view_name="ingest_view_tag",
            view_query_template=view_query_template,
            region_raw_table_config=region_config,
            order_by_cols="col1, col2",
            is_detect_row_deletion_view=False,
            primary_key_tables_for_entity_deletion=[],
        )
    self.assertEqual(
        str(e.exception),
        "Found CREATE TEMP TABLE clause in this query - ingest views cannot contain CREATE clauses.",
    )
def _parsed_result() -> Dict[DeclarativeMeta, pd.DataFrame]:
    global _PARSED_RESULT
    if not _PARSED_RESULT:
        _PARSED_RESULT = pa_aggregate_ingest.parse(
            fixtures.as_filepath(
                "2018 County Statistics _ General Information - 2017 Data.xlsx"
            ),
        )
    return _PARSED_RESULT
def test_get_export_config_missing_state_code(self) -> None:
    product_configs = ProductConfigs.from_file(
        path=fixtures.as_filepath("fixture_products.yaml"))
    with self.assertRaisesRegex(
        BadProductExportSpecificationError,
        "Missing required state_code parameter for export_job_name EXPORT",
    ):
        product_configs.get_export_config(export_job_name="EXPORT")
def _get_JID() -> pd.DataFrame:
    global _JID
    if _JID is None:
        _JID = pd.read_csv(
            as_filepath("jid.csv", subdir="data_sets"), dtype={"fips": str})
    return _JID
def _get_FIPS() -> pd.DataFrame:
    global _FIPS
    if _FIPS is None:
        _FIPS = pd.read_csv(
            as_filepath("fips.csv", subdir="data_sets"), dtype={"fips": str})
    return _FIPS
def _get_FID() -> pd.DataFrame:
    global _FID
    if _FID is None:
        _FID = pd.read_csv(
            as_filepath("fid.csv", subdir="data_sets"), dtype={"vera_jid": str})
    return _FID
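# The three _get_* helpers above hand-roll the same lazy module-level cache.
# Assuming the loaded frames are treated as read-only, the memoization could
# equivalently be written with functools.lru_cache; a sketch (the path below
# is illustrative):
from functools import lru_cache

import pandas as pd


@lru_cache(maxsize=None)
def _get_jid_cached() -> pd.DataFrame:
    # Computed on first call, then cached -- like the _JID global above.
    # Note lru_cache hands back the same (mutable) DataFrame object each time.
    return pd.read_csv("data_sets/jid.csv", dtype={"fips": str})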
def test_parse_no_defaults_throws(self) -> None:
    with self.assertRaises(ValueError) as e:
        _ = DirectIngestRegionRawFileConfig(
            region_code="us_yy",
            yaml_config_file_dir=fixtures.as_filepath("us_yy"),
        )
    self.assertEqual(
        str(e.exception), "Missing default raw data configs for region: us_yy")
def _parsed_result() -> Dict[DeclarativeMeta, pd.DataFrame]:
    global _PARSED_RESULT
    if not _PARSED_RESULT:
        _PARSED_RESULT = ca_aggregate_ingest.parse(
            fixtures.as_filepath("QueryResult.xls")
        )
    return _PARSED_RESULT
def test_direct_ingest_preprocessed_view_with_reference_table(self):
    region_config = DirectIngestRegionRawFileConfig(
        region_code='us_xx',
        yaml_config_file_path=fixtures.as_filepath(
            'us_xx_raw_data_files.yaml'),
    )
    view_query_template = """SELECT * FROM {file_tag_first}
LEFT OUTER JOIN `{{project_id}}.reference_tables.my_table`
USING (col1);"""

    view = DirectIngestPreProcessedIngestView(
        ingest_view_name='ingest_view_tag',
        view_query_template=view_query_template,
        region_raw_table_config=region_config,
        order_by_cols='col1, col2')

    self.assertEqual(
        ['file_tag_first'],
        [c.file_tag for c in view.raw_table_dependency_configs])

    expected_view_query = """WITH file_tag_first_generated_view AS (
    SELECT * FROM `recidiviz-456.us_xx_raw_data_up_to_date_views.file_tag_first_latest`
)
SELECT * FROM file_tag_first_generated_view
LEFT OUTER JOIN `recidiviz-456.reference_tables.my_table`
USING (col1)
ORDER BY col1, col2;"""

    self.assertEqual(expected_view_query, view.view_query)

    expected_date_parametrized_view_query = """WITH file_tag_first_generated_view AS (
    WITH rows_with_recency_rank AS (
        SELECT * EXCEPT (file_id, update_datetime),
            ROW_NUMBER() OVER (PARTITION BY col_name_1a, col_name_1b
                               ORDER BY update_datetime DESC) AS recency_rank
        FROM `recidiviz-456.us_xx_raw_data.file_tag_first`
        WHERE update_datetime <= @my_param
    )
    SELECT * EXCEPT (recency_rank)
    FROM rows_with_recency_rank
    WHERE recency_rank = 1
)
SELECT * FROM file_tag_first_generated_view
LEFT OUTER JOIN `recidiviz-456.reference_tables.my_table`
USING (col1)
ORDER BY col1, col2;"""

    self.assertEqual(
        expected_date_parametrized_view_query,
        view.date_parametrized_view_query('my_param'))
def test_missing_configs_for_region(self) -> None:
    with self.assertRaises(ValueError) as e:
        region_config = DirectIngestRegionRawFileConfig(
            region_code="us_xy",
            yaml_config_file_dir=fixtures.as_filepath("us_xy"),
        )
        _configs = region_config.raw_file_configs
    self.assertEqual(str(e.exception), "Missing raw data configs for region: us_xy")
def test_get_export_config_too_many_exports(self) -> None:
    product_configs = ProductConfigs.from_file(
        path=fixtures.as_filepath("fixture_products.yaml"))
    product_configs.products.append(product_configs.products[0])
    with self.assertRaisesRegex(
        BadProductExportSpecificationError,
        "Wrong number of products returned for export for export_job_name EXPORT",
    ):
        product_configs.get_export_config(export_job_name="EXPORT")