def test_get_unprocessed_raw_files_to_import(self) -> None:
    self.assertEqual(
        [], self.import_manager.get_unprocessed_raw_files_to_import())

    raw_unprocessed = path_for_fixture_file_in_test_gcs_directory(
        directory=self.ingest_directory_path,
        filename="file_tag_first.csv",
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA,
    )
    ingest_view_unprocessed = path_for_fixture_file_in_test_gcs_directory(
        directory=self.ingest_directory_path,
        filename="file_tag_second.csv",
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.INGEST_VIEW,
    )
    fixture_util.add_direct_ingest_path(
        self.fs.gcs_file_system, raw_unprocessed, has_fixture=False)
    fixture_util.add_direct_ingest_path(
        self.fs.gcs_file_system, ingest_view_unprocessed, has_fixture=False)

    self.assertEqual(
        [raw_unprocessed],
        self.import_manager.get_unprocessed_raw_files_to_import())

def test_import_wrong_separator_cols_do_not_parse(self) -> None:
    file_config = self.import_manager.region_raw_file_config.raw_file_configs[
        "tagC"]
    updated_file_config = attr.evolve(file_config, separator="#")
    self.import_manager.region_raw_file_config.raw_file_configs[
        "tagC"] = updated_file_config

    file_path = path_for_fixture_file_in_test_gcs_directory(
        bucket_path=self.ingest_bucket_path,
        filename="tagC.csv",
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA,
    )
    fixture_util.add_direct_ingest_path(
        self.fs.gcs_file_system, file_path,
        region_code=self.test_region.region_code)

    with self.assertRaises(ValueError) as e:
        self.import_manager.import_raw_file_to_big_query(
            file_path, create_autospec(DirectIngestRawFileMetadata))

    self.assertTrue(
        str(e.exception).startswith(
            "Found only one column: [COL1__COL2_COL3]. "
            "Columns likely did not parse properly."))

def test_import_bq_file_with_raw_file_alternate_separator_and_encoding(
        self):
    file_path = path_for_fixture_file_in_test_gcs_directory(
        directory=self.ingest_directory_path,
        filename='tagPipeSeparatedNonUTF8.txt',
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA)

    self.fs.test_add_path(file_path)

    self.import_manager.import_raw_file_to_big_query(
        file_path, self._metadata_for_unprocessed_file_path(file_path))

    self.assertEqual(1, len(self.fs.uploaded_test_path_to_actual))
    path = one(self.fs.uploaded_test_path_to_actual.keys())

    self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
        source_uri=f'gs://{path}',
        destination_dataset_ref=bigquery.DatasetReference(
            self.project_id, 'us_xx_raw_data'),
        destination_table_id='tagPipeSeparatedNonUTF8',
        destination_table_schema=[
            bigquery.SchemaField('PRIMARY_COL1', 'STRING', 'NULLABLE'),
            bigquery.SchemaField('COL2', 'STRING', 'NULLABLE'),
            bigquery.SchemaField('COL3', 'STRING', 'NULLABLE'),
            bigquery.SchemaField('COL4', 'STRING', 'NULLABLE'),
            bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'),
            bigquery.SchemaField('update_datetime', 'DATETIME', 'REQUIRED')
        ])

    self.assertEqual(5, self.num_lines_uploaded)
    self._check_no_temp_files_remain()

def test_import_bq_file_with_raw_file_invalid_column_chars(self) -> None:
    file_path = path_for_fixture_file_in_test_gcs_directory(
        directory=self.ingest_directory_path,
        filename="tagInvalidCharacters.csv",
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA,
    )
    fixture_util.add_direct_ingest_path(self.fs.gcs_file_system, file_path)

    self.import_manager.import_raw_file_to_big_query(
        file_path, self._metadata_for_unprocessed_file_path(file_path))

    self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
    path = one(self.fs.gcs_file_system.uploaded_paths)

    self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
        source_uri=path.uri(),
        destination_dataset_ref=bigquery.DatasetReference(
            self.project_id, "us_xx_raw_data"),
        destination_table_id="tagInvalidCharacters",
        destination_table_schema=[
            bigquery.SchemaField("COL_1", "STRING", "NULLABLE"),
            bigquery.SchemaField("_COL2", "STRING", "NULLABLE"),
            bigquery.SchemaField("_3COL", "STRING", "NULLABLE"),
            bigquery.SchemaField("_4_COL", "STRING", "NULLABLE"),
            bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
            bigquery.SchemaField("update_datetime", "DATETIME", "REQUIRED"),
        ],
    )

    self.assertEqual(1, self.num_lines_uploaded)
    self._check_no_temp_files_remain()

def test_import_bq_file_with_unspecified_type_file(self):
    file_path = path_for_fixture_file_in_test_gcs_directory(
        directory=self.ingest_directory_path,
        filename='file_tag_first.csv',
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.UNSPECIFIED)

    with self.assertRaises(ValueError):
        self.import_manager.import_raw_file_to_big_query(
            file_path, create_autospec(DirectIngestFileMetadata))

def test_import_bq_file_not_in_tags(self):
    file_path = path_for_fixture_file_in_test_gcs_directory(
        directory=self.ingest_directory_path,
        filename='this_path_tag_not_in_yaml.csv',
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA)

    with self.assertRaises(ValueError):
        self.import_manager.import_raw_file_to_big_query(
            file_path, create_autospec(DirectIngestFileMetadata))

def test_import_bq_file_with_ingest_view_file(self) -> None:
    file_path = path_for_fixture_file_in_test_gcs_directory(
        directory=self.ingest_directory_path,
        filename="file_tag_first.csv",
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.INGEST_VIEW,
    )

    with self.assertRaises(ValueError):
        self.import_manager.import_raw_file_to_big_query(
            file_path, create_autospec(DirectIngestFileMetadata))

def test_get_unprocessed_raw_files_to_import(self):
    self.assertEqual(
        [], self.import_manager.get_unprocessed_raw_files_to_import())

    raw_unprocessed = path_for_fixture_file_in_test_gcs_directory(
        directory=self.ingest_directory_path,
        filename='file_tag_first.csv',
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA)
    ingest_view_unprocessed = path_for_fixture_file_in_test_gcs_directory(
        directory=self.ingest_directory_path,
        filename='file_tag_second.csv',
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.INGEST_VIEW)

    self.fs.test_add_path(raw_unprocessed)
    self.fs.test_add_path(ingest_view_unprocessed)

    self.assertEqual(
        [raw_unprocessed],
        self.import_manager.get_unprocessed_raw_files_to_import())

def test_import_bq_file_with_ingest_view_file(self) -> None:
    file_path = path_for_fixture_file_in_test_gcs_directory(
        bucket_path=self.ingest_bucket_path,
        filename="file_tag_first.csv",
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.INGEST_VIEW,
    )

    with self.assertRaises(ValueError) as e:
        self.import_manager.import_raw_file_to_big_query(
            file_path, create_autospec(DirectIngestRawFileMetadata))

    self.assertEqual(
        str(e.exception),
        "Unexpected file type [GcsfsDirectIngestFileType.INGEST_VIEW] for "
        "path [file_tag_first].",
    )

def test_import_bq_file_not_in_tags(self) -> None:
    file_path = path_for_fixture_file_in_test_gcs_directory(
        bucket_path=self.ingest_bucket_path,
        filename="this_path_tag_not_in_yaml.csv",
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA,
    )

    with self.assertRaises(ValueError) as e:
        self.import_manager.import_raw_file_to_big_query(
            file_path, create_autospec(DirectIngestRawFileMetadata))

    self.assertEqual(
        str(e.exception),
        "Attempting to import raw file with tag [this_path_tag_not_in_yaml] "
        "unspecified by [us_xx] config.",
    )

def test_import_bq_file_with_raw_file_normalization_conflict(self):
    with self.assertRaises(ValueError) as e:
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='tagNormalizationConflict.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)
        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                            file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

    self.assertEqual(
        str(e.exception),
        "Multiple columns with name [_4COL] after normalization.")

def test_import_bq_file_multiple_chunks_uneven_division(self) -> None:
    self.import_manager.upload_chunk_size = 2

    file_path = path_for_fixture_file_in_test_gcs_directory(
        bucket_path=self.ingest_bucket_path,
        filename="tagPipeSeparatedNonUTF8.txt",
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA,
    )
    fixture_util.add_direct_ingest_path(
        self.fs.gcs_file_system, file_path,
        region_code=self.test_region.region_code)

    self.import_manager.import_raw_file_to_big_query(
        file_path, self._metadata_for_unprocessed_file_path(file_path))

    self.assertEqual(3, len(self.fs.gcs_file_system.uploaded_paths))

    expected_insert_calls = [
        call(
            source_uri=uploaded_path.uri(),
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, "us_xx_raw_data"),
            destination_table_id="tagPipeSeparatedNonUTF8",
            destination_table_schema=[
                bigquery.SchemaField("PRIMARY_COL1", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL2", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL3", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL4", "STRING", "NULLABLE"),
                bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                bigquery.SchemaField("update_datetime", "DATETIME",
                                     "REQUIRED"),
            ],
        ) for uploaded_path in self.fs.gcs_file_system.uploaded_paths
    ]

    self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_has_calls(
        expected_insert_calls, any_order=True)
    self.assertEqual(
        len(expected_insert_calls) - 1, self.mock_time.sleep.call_count)
    self.assertEqual(5, self.num_lines_uploaded)
    self._check_no_temp_files_remain()

def test_import_bq_file_with_migrations(self) -> None:
    file_datetime = migrations_tagC.DATE_1
    file_path = path_for_fixture_file_in_test_gcs_directory(
        bucket_path=self.ingest_bucket_path,
        filename="tagC.csv",
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA,
        dt=file_datetime,
    )
    fixture_util.add_direct_ingest_path(
        self.fs.gcs_file_system, file_path,
        region_code=self.test_region.region_code)

    mock_query_jobs = [
        mock.MagicMock(),
        mock.MagicMock(),
    ]
    self.mock_big_query_client.run_query_async.side_effect = mock_query_jobs

    self.import_manager.import_raw_file_to_big_query(
        file_path, self._metadata_for_unprocessed_file_path(file_path))

    self.mock_big_query_client.run_query_async.assert_has_calls([
        mock.call(
            query_str="UPDATE `recidiviz-456.us_xx_raw_data.tagC` original\n"
            "SET COL1 = updates.new__COL1\n"
            "FROM (SELECT * FROM UNNEST([\n"
            " STRUCT('123' AS COL1, CAST('2020-06-10T00:00:00' AS DATETIME) AS update_datetime, '456' AS new__COL1),\n"
            " STRUCT('123' AS COL1, CAST('2020-09-21T00:00:00' AS DATETIME) AS update_datetime, '456' AS new__COL1)\n"
            "])) updates\n"
            "WHERE original.COL1 = updates.COL1 AND original.update_datetime = updates.update_datetime;"
        ),
        mock.call(
            query_str="DELETE FROM `recidiviz-456.us_xx_raw_data.tagC`\n"
            "WHERE STRUCT(COL1) IN (\n"
            " STRUCT('789')\n"
            ");"),
    ])

    for mock_query_job in mock_query_jobs:
        mock_query_job.result.assert_called_once()

def test_import_bq_file_feature_not_released_throws(self):
    self.import_manager = DirectIngestRawFileImportManager(
        region=fake_region(region_code='us_xx',
                           are_raw_data_bq_imports_enabled_in_env=False),
        fs=self.fs,
        ingest_directory_path=self.ingest_directory_path,
        temp_output_directory_path=self.temp_output_path,
        region_raw_file_config=self.region_raw_file_config,
        big_query_client=self.mock_big_query_client)

    file_path = path_for_fixture_file_in_test_gcs_directory(
        directory=self.ingest_directory_path,
        filename='file_tag_first.csv',
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA)

    with self.assertRaises(ValueError):
        self.import_manager.import_raw_file_to_big_query(
            file_path, create_autospec(DirectIngestFileMetadata))

def test_import_bq_file_with_row_extra_columns(self) -> None:
    file_path = path_for_fixture_file_in_test_gcs_directory(
        bucket_path=self.ingest_bucket_path,
        filename="tagRowExtraColumns.csv",
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA,
    )
    fixture_util.add_direct_ingest_path(
        self.fs.gcs_file_system, file_path,
        region_code=self.test_region.region_code)

    with self.assertRaisesRegex(ParserError,
                                "Expected 4 fields in line 3, saw 5"):
        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

    self.assertEqual(0, len(self.fs.gcs_file_system.uploaded_paths))
    self._check_no_temp_files_remain()

def test_import_bq_file_multiple_chunks_uneven_division(self):
    self.import_manager.upload_chunk_size = 2

    file_path = path_for_fixture_file_in_test_gcs_directory(
        directory=self.ingest_directory_path,
        filename='tagPipeSeparatedNonUTF8.txt',
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA)
    fixture_util.add_direct_ingest_path(self.fs.gcs_file_system, file_path)

    self.import_manager.import_raw_file_to_big_query(
        file_path, self._metadata_for_unprocessed_file_path(file_path))

    self.assertEqual(3, len(self.fs.gcs_file_system.uploaded_paths))

    expected_insert_calls = [
        call(source_uri=uploaded_path.uri(),
             destination_dataset_ref=bigquery.DatasetReference(
                 self.project_id, 'us_xx_raw_data'),
             destination_table_id='tagPipeSeparatedNonUTF8',
             destination_table_schema=[
                 bigquery.SchemaField('PRIMARY_COL1', 'STRING', 'NULLABLE'),
                 bigquery.SchemaField('COL2', 'STRING', 'NULLABLE'),
                 bigquery.SchemaField('COL3', 'STRING', 'NULLABLE'),
                 bigquery.SchemaField('COL4', 'STRING', 'NULLABLE'),
                 bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'),
                 bigquery.SchemaField('update_datetime', 'DATETIME',
                                      'REQUIRED')
             ]) for uploaded_path in self.fs.gcs_file_system.uploaded_paths
    ]

    self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_has_calls(
        expected_insert_calls, any_order=True)
    self.assertEqual(
        len(expected_insert_calls) - 1, self.mock_time.sleep.call_count)
    self.assertEqual(5, self.num_lines_uploaded)
    self._check_no_temp_files_remain()

def test_import_bq_file_with_multibyte_raw_file_alternate_separator_and_encoding(
    self,
) -> None:
    file_path = path_for_fixture_file_in_test_gcs_directory(
        bucket_path=self.ingest_bucket_path,
        filename="tagDoubleDaggerWINDOWS1252.csv",
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA,
    )
    fixture_util.add_direct_ingest_path(
        self.fs.gcs_file_system, file_path,
        region_code=self.test_region.region_code)

    self.import_manager.import_raw_file_to_big_query(
        file_path, self._metadata_for_unprocessed_file_path(file_path))

    self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
    path = one(self.fs.gcs_file_system.uploaded_paths)

    self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
        source_uri=path.uri(),
        destination_dataset_ref=bigquery.DatasetReference(
            self.project_id, "us_xx_raw_data"),
        destination_table_id="tagDoubleDaggerWINDOWS1252",
        destination_table_schema=[
            bigquery.SchemaField("PRIMARY_COL1", "STRING", "NULLABLE"),
            bigquery.SchemaField("COL2", "STRING", "NULLABLE"),
            bigquery.SchemaField("COL3", "STRING", "NULLABLE"),
            bigquery.SchemaField("COL4", "STRING", "NULLABLE"),
            bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
            bigquery.SchemaField("update_datetime", "DATETIME", "REQUIRED"),
        ],
    )

    self.assertEqual(5, self.num_lines_uploaded)
    self._check_no_temp_files_remain()

def test_import_bq_file_with_migrations(self) -> None:
    file_datetime = migrations_tagC.DATE_1
    file_path = path_for_fixture_file_in_test_gcs_directory(
        directory=self.ingest_directory_path,
        filename="tagC.csv",
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA,
        dt=file_datetime,
    )
    fixture_util.add_direct_ingest_path(self.fs.gcs_file_system, file_path)

    self.import_manager.import_raw_file_to_big_query(
        file_path, self._metadata_for_unprocessed_file_path(file_path))

    self.mock_big_query_client.run_query_async.assert_has_calls([
        mock.call(
            query_str=
            f"UPDATE `recidiviz-456.us_xx_raw_data.tagC` SET COL1 = '456' WHERE COL1 = '123' AND update_datetime = '{file_datetime.isoformat()}';"
        ),
        mock.call(
            query_str=
            "DELETE FROM `recidiviz-456.us_xx_raw_data.tagC` WHERE COL1 = '789';"
        ),
    ])