def setUp(self) -> None:
    self.project_id = "recidiviz-456"
    self.project_id_patcher = patch("recidiviz.utils.metadata.project_id")
    self.project_id_patcher.start().return_value = self.project_id
    self.test_region = fake_region(
        region_code="us_xx", are_raw_data_bq_imports_enabled_in_env=True
    )

    self.region_module_patcher = patch.object(
        direct_ingest_raw_table_migration_collector,
        "regions",
        new=controller_fixtures,
    )
    self.region_module_patcher.start()

    self.fs = DirectIngestGCSFileSystem(FakeGCSFileSystem())
    self.ingest_directory_path = GcsfsDirectoryPath(
        bucket_name="direct/controllers/fixtures"
    )
    self.temp_output_path = GcsfsDirectoryPath(bucket_name="temp_bucket")

    self.region_raw_file_config = DirectIngestRegionRawFileConfig(
        region_code="us_xx",
        yaml_config_file_dir=fixtures.as_filepath("us_xx"),
    )

    self.mock_big_query_client = create_autospec(BigQueryClient)
    self.num_lines_uploaded = 0
    self.mock_big_query_client.insert_into_table_from_cloud_storage_async.side_effect = (
        self.mock_import_raw_file_to_big_query
    )

    self.import_manager = DirectIngestRawFileImportManager(
        region=self.test_region,
        fs=self.fs,
        ingest_directory_path=self.ingest_directory_path,
        temp_output_directory_path=self.temp_output_path,
        region_raw_file_config=self.region_raw_file_config,
        big_query_client=self.mock_big_query_client,
    )
    self.import_manager.csv_reader = _TestSafeGcsCsvReader(self.fs.gcs_file_system)

    self.time_patcher = patch(
        "recidiviz.ingest.direct.controllers.direct_ingest_raw_file_import_manager.time"
    )
    self.mock_time = self.time_patcher.start()

    def fake_get_dataset_ref(dataset_id: str) -> bigquery.DatasetReference:
        return bigquery.DatasetReference(
            project=self.project_id, dataset_id=dataset_id
        )

    self.mock_big_query_client.dataset_ref_for_id = fake_get_dataset_ref
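# The setUp above starts three patchers (project_id, the regions module, and
# time) but no matching tearDown appears in this section. A minimal sketch of
# one, assuming only the patchers created above need cleanup:
def tearDown(self) -> None:
    self.time_patcher.stop()
    self.region_module_patcher.stop()
    self.project_id_patcher.stop()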
def build_path(bucket_template: str, state: str, pdf_name: str) -> GcsfsFilePath:
    return GcsfsFilePath.from_directory_and_file_name(
        GcsfsDirectoryPath(bucket_template.format(metadata.project_id()), state),
        pdf_name,
    )
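# Illustrative usage of build_path, assuming metadata.project_id() returns
# "recidiviz-456" as patched above; the bucket template and file name here are
# hypothetical, not values from this section:
#
#   build_path("{}-state-aggregates", "us_xx", "report.pdf")
#   -> gs://recidiviz-456-state-aggregates/us_xx/report.pdf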
def state_aggregate() -> Tuple[str, HTTPStatus]:
    """Calls state aggregates"""
    bucket = get_str_param_value("bucket", request.args)
    state = get_str_param_value("state", request.args)
    filename = get_str_param_value("filename", request.args)
    project_id = metadata.project_id()
    logging.info("The project id is %s", project_id)
    if not bucket or not state or not filename:
        raise StateAggregateError("All of state, bucket, and filename must be provided")
    directory_path = GcsfsDirectoryPath(bucket, state)
    path = GcsfsFilePath.from_directory_and_file_name(directory_path, filename)
    parser = STATE_TO_PARSER[state]
    fs = GcsfsFactory.build()
    logging.info("The path to download from is %s", path)

    logging.info("The files in the directory are:")
    logging.info(
        fs.ls_with_blob_prefix(
            bucket_name=directory_path.bucket_name,
            blob_prefix=directory_path.relative_path,
        )
    )

    # Providing a stream buffer to the tabula reader does not work because it
    # tries to load the file onto the local filesystem. Since App Engine has a
    # read-only filesystem (except for the tmpdir), we download the file into
    # the local tmpdir and pass that in.
    handle = fs.download_to_temp_file(path)
    if not handle:
        raise StateAggregateError(f"Unable to download file: {path}")
    logging.info("Successfully downloaded file from gcs: %s", handle.local_file_path)

    result = parser(handle.local_file_path)
    logging.info("Successfully parsed the report")
    for table, df in result.items():
        dao.write_df(table, df)

    # If we are successful, we want to move the file out of the cloud
    # function triggered directory, and into the historical path.
    historical_path = GcsfsFilePath.from_directory_and_file_name(
        GcsfsDirectoryPath(HISTORICAL_BUCKET.format(project_id), state), filename
    )
    fs.mv(path, historical_path)
    return "", HTTPStatus.OK
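# Sketch of how state_aggregate expects to be invoked, inferred from its use of
# request.args; the route and parameter values below are hypothetical, as no
# route registration appears in this section:
#
#   GET /state_aggregate?bucket=<bucket-name>&state=us_xx&filename=report.pdf
#
# On success the handler parses the report, writes each resulting DataFrame via
# dao.write_df, and moves the source file to the historical bucket.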
class TestDirectIngestGcsFileSystem(TestCase):
    """Tests for the DirectIngestGCSFileSystem."""

    STORAGE_DIR_PATH = GcsfsDirectoryPath(
        bucket_name='storage_bucket', relative_path='region_subdir'
    )

    INGEST_DIR_PATH = GcsfsDirectoryPath(bucket_name='my_bucket')

    def setUp(self) -> None:
        self.fs = DirectIngestGCSFileSystem(FakeGCSFileSystem())

    def fully_process_file(
        self,
        dt: datetime.datetime,
        path: GcsfsFilePath,
        file_type_differentiation_on: bool = False,
    ) -> None:
        """Mimics all the file system calls for a single file in the direct
        ingest system, from getting added to the ingest bucket, turning into a
        processed file, then getting moved to storage."""
        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system, path, has_fixture=False
        )

        start_num_total_files = len(self.fs.gcs_file_system.all_paths)
        # pylint: disable=protected-access
        start_ingest_paths = self.fs._ls_with_file_prefix(
            self.INGEST_DIR_PATH, '', None
        )
        start_storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '', None
        )
        if file_type_differentiation_on:
            start_raw_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH,
                '',
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA,
            )
            start_ingest_view_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH,
                '',
                file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW,
            )
        else:
            start_raw_storage_paths = []
            start_ingest_view_storage_paths = []

        # File is renamed to normalized path
        file_type = (
            GcsfsDirectIngestFileType.RAW_DATA
            if file_type_differentiation_on
            else GcsfsDirectIngestFileType.UNSPECIFIED
        )
        self.fs.mv_path_to_normalized_path(path, file_type, dt)

        if file_type_differentiation_on:
            raw_unprocessed = self.fs.get_unprocessed_file_paths(
                self.INGEST_DIR_PATH,
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA,
            )
            self.assertEqual(len(raw_unprocessed), 1)
            self.assertTrue(self.fs.is_seen_unprocessed_file(raw_unprocessed[0]))

            # ... raw file imported to BQ
            processed_path = self.fs.mv_path_to_processed_path(raw_unprocessed[0])

            processed = self.fs.get_processed_file_paths(self.INGEST_DIR_PATH, None)
            self.assertEqual(len(processed), 1)

            self.fs.copy(
                processed_path,
                GcsfsFilePath.from_absolute_path(
                    to_normalized_unprocessed_file_path_from_normalized_path(
                        processed_path.abs_path(),
                        file_type_override=GcsfsDirectIngestFileType.INGEST_VIEW,
                    )
                ),
            )
            self.fs.mv_path_to_storage(processed_path, self.STORAGE_DIR_PATH)

        ingest_unprocessed_filter = (
            GcsfsDirectIngestFileType.INGEST_VIEW
            if file_type_differentiation_on
            else None
        )

        ingest_unprocessed = self.fs.get_unprocessed_file_paths(
            self.INGEST_DIR_PATH, file_type_filter=ingest_unprocessed_filter
        )
        self.assertEqual(len(ingest_unprocessed), 1)
        self.assertTrue(self.fs.is_seen_unprocessed_file(ingest_unprocessed[0]))

        # ... file is ingested

        # File is moved to processed path
        self.fs.mv_path_to_processed_path(ingest_unprocessed[0])
        processed = self.fs.get_processed_file_paths(self.INGEST_DIR_PATH, None)
        self.assertEqual(len(processed), 1)
        self.assertTrue(self.fs.is_processed_file(processed[0]))

        unprocessed = self.fs.get_unprocessed_file_paths(self.INGEST_DIR_PATH, None)
        self.assertEqual(len(unprocessed), 0)

        # File is moved to storage
        ingest_move_type_filter = (
            GcsfsDirectIngestFileType.INGEST_VIEW
            if file_type_differentiation_on
            else None
        )
        self.fs.mv_processed_paths_before_date_to_storage(
            self.INGEST_DIR_PATH,
            self.STORAGE_DIR_PATH,
            date_str_bound=dt.date().isoformat(),
            include_bound=True,
            file_type_filter=ingest_move_type_filter,
        )

        end_ingest_paths = self.fs._ls_with_file_prefix(
            self.INGEST_DIR_PATH, '', file_type_filter=None
        )
        end_storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '', file_type_filter=None
        )
        if file_type_differentiation_on:
            end_raw_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH,
                '',
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA,
            )
            end_ingest_view_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH,
                '',
                file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW,
            )
        else:
            end_raw_storage_paths = []
            end_ingest_view_storage_paths = []

        # Each file gets re-exported as an ingest view
        splitting_factor = 2 if file_type_differentiation_on else 1

        expected_final_total_files = start_num_total_files + splitting_factor - 1
        self.assertEqual(
            len(self.fs.gcs_file_system.all_paths), expected_final_total_files
        )
        self.assertEqual(len(end_ingest_paths), len(start_ingest_paths) - 1)
        self.assertEqual(
            len(end_storage_paths), len(start_storage_paths) + 1 * splitting_factor
        )
        if file_type_differentiation_on:
            self.assertEqual(
                len(end_raw_storage_paths) + len(end_ingest_view_storage_paths),
                len(end_storage_paths),
            )
            self.assertEqual(
                len(end_raw_storage_paths), len(start_raw_storage_paths) + 1
            )
            self.assertEqual(
                len(end_ingest_view_storage_paths),
                len(start_ingest_view_storage_paths) + 1,
            )

        for sp in end_storage_paths:
            parts = filename_parts_from_path(sp)
            if sp.abs_path() not in {p.abs_path() for p in start_storage_paths}:
                self.assertTrue(
                    sp.abs_path().startswith(self.STORAGE_DIR_PATH.abs_path())
                )
                dir_path, storage_file_name = os.path.split(sp.abs_path())
                if parts.file_type != GcsfsDirectIngestFileType.UNSPECIFIED:
                    self.assertTrue(parts.file_type.value in dir_path)
                name, _ = path.file_name.split('.')
                self.assertTrue(name in storage_file_name)

    def test_direct_ingest_file_moves(self) -> None:
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'),
        )

    def test_direct_ingest_multiple_file_moves(self) -> None:
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'),
        )
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file_2.csv'),
        )

    def test_move_to_storage_with_conflict(self) -> None:
        dt = datetime.datetime.now()
        self.fully_process_file(
            dt, GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv')
        )

        # Try uploading a file with a duplicate name that has already been
        # moved to storage
        self.fully_process_file(
            dt, GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv')
        )

        # pylint: disable=protected-access
        storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '', file_type_filter=None
        )
        self.assertEqual(len(storage_paths), 2)

        found_first_file = False
        found_second_file = False
        for path in storage_paths:
            self.assertTrue(filename_parts_from_path(path))
            if path.abs_path().endswith('test_file.csv'):
                found_first_file = True
            if path.abs_path().endswith('test_file-(1).csv'):
                found_second_file = True

        self.assertTrue(found_first_file)
        self.assertTrue(found_second_file)

    def test_direct_ingest_file_moves_with_file_types(self) -> None:
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'),
            file_type_differentiation_on=True,
        )

    def test_direct_ingest_multiple_file_moves_with_file_types(self) -> None:
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'),
            file_type_differentiation_on=True,
        )
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file_2.csv'),
            file_type_differentiation_on=True,
        )

    def test_move_to_storage_with_conflict_with_file_types(self) -> None:
        dt = datetime.datetime.now()
        self.fully_process_file(
            dt,
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'),
            file_type_differentiation_on=True,
        )

        # Try uploading a file with a duplicate name that has already been
        # moved to storage
        self.fully_process_file(
            dt,
            GcsfsFilePath(bucket_name='my_bucket', blob_name='test_file.csv'),
            file_type_differentiation_on=True,
        )

        # pylint: disable=protected-access
        storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '', file_type_filter=None
        )
        self.assertEqual(len(storage_paths), 4)

        found_first_file = False
        found_second_file = False
        for path in storage_paths:
            if path.abs_path().endswith('test_file.csv'):
                found_first_file = True
            if path.abs_path().endswith('test_file-(1).csv'):
                found_second_file = True

        self.assertTrue(found_first_file)
        self.assertTrue(found_second_file)
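# Illustration of the name-conflict behavior asserted in the tests above
# (inferred from the assertions, not from the renaming implementation itself):
# when a file whose name already exists in storage is moved there, a numeric
# suffix is appended before the extension, e.g.
#
#   test_file.csv -> test_file-(1).csv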