def _run_ingest_job_for_filename(self, filename: str) -> None:
    """Runs ingest for the ingest view file with the given unnormalized
    file name, with region lookup mocked out and local persistence forced.
    """
    get_region_patcher = patch(
        "recidiviz.persistence.entity_matching.state."
        "base_state_matching_delegate.get_region")
    mock_get_region = get_region_patcher.start()
    mock_get_region.return_value = self._fake_region()

    environ_patcher = patch.dict(
        'os.environ', {'PERSIST_LOCALLY': 'true'}
    )
    environ_patcher.start()

    try:
        file_path = path_for_fixture_file(self.controller,
                                          filename,
                                          should_normalize=False)

        if not isinstance(self.controller.fs,
                          FakeDirectIngestGCSFileSystem):
            raise ValueError(f"Controller fs must have type "
                             f"FakeDirectIngestGCSFileSystem. Found instead "
                             f"type [{type(self.controller.fs)}]")

        self.controller.fs.test_add_path(file_path)

        run_task_queues_to_empty(self.controller)
    finally:
        # Always unpatch, even if ingest raised, so the active patches do
        # not leak into other tests.
        get_region_patcher.stop()
        environ_patcher.stop()
def test_cloud_function_fails_on_new_file_rename_later_with_cron(self):
    """Files whose upload-time handling fails are still normalized later
    when the cron-triggered handle_new_files task runs (with
    can_start_ingest=False, so no ingest is actually kicked off)."""
    controller = build_gcsfs_controller_for_tests(
        StateTestGcsfsDirectIngestController,
        self.FIXTURE_PATH_PREFIX,
        run_async=False)

    self.assertIsInstance(
        controller.cloud_task_manager,
        FakeSynchronousDirectIngestCloudTaskManager,
        "Expected FakeSynchronousDirectIngestCloudTaskManager")
    task_manager = controller.cloud_task_manager

    if not isinstance(controller.fs, FakeDirectIngestGCSFileSystem):
        raise ValueError(f"Controller fs must have type "
                         f"FakeDirectIngestGCSFileSystem. Found instead "
                         f"type [{type(controller.fs)}]")

    # Upload new files without triggering the controller
    for fixture_name in ['tagA.csv', 'tagB.csv', 'tagC.csv']:
        file_path = path_for_fixture_file(controller,
                                          fixture_name,
                                          should_normalize=False)
        controller.fs.test_add_path(file_path, fail_handle_file_call=True)

    self.assertEqual(
        0, task_manager.get_scheduler_queue_info(controller.region).size())
    self.assertEqual(
        0,
        task_manager.get_process_job_queue_info(controller.region).size())
    for path in controller.fs.all_paths:
        self.assertFalse(controller.fs.is_normalized_file_path(path))

    # Cron job to handle unseen files triggers later
    controller.cloud_task_manager. \
        create_direct_ingest_handle_new_files_task(
            controller.region, can_start_ingest=False)

    run_task_queues_to_empty(controller)

    for path in controller.fs.all_paths:
        self.assertTrue(controller.fs.is_normalized_file_path(path))
def _run_ingest_job_for_filename(self, filename: str) -> None:
    """Runs ingest for the ingest view file with the given unnormalized
    file name."""
    get_region_patcher = patch(
        "recidiviz.persistence.entity_matching.state."
        "base_state_matching_delegate.get_region")
    mock_get_region = get_region_patcher.start()
    mock_get_region.return_value = self._fake_region()

    environ_patcher = patch.dict("os.environ", {"PERSIST_LOCALLY": "true"})
    environ_patcher.start()

    try:
        # Regions with raw/ingest file name detection enabled tag the
        # fixture as an INGEST_VIEW file; others use an untyped path.
        file_type = (GcsfsDirectIngestFileType.INGEST_VIEW
                     if self.controller.region.
                     is_raw_vs_ingest_file_name_detection_enabled()
                     else None)

        if not isinstance(self.controller.fs.gcs_file_system,
                          FakeGCSFileSystem):
            raise ValueError(
                f"Controller fs must have type "
                f"FakeGCSFileSystem. Found instead "
                f"type [{type(self.controller.fs.gcs_file_system)}]")

        if self.controller.region.are_ingest_view_exports_enabled_in_env():
            # Export path: register and run an ingest view export job
            # covering the last day instead of dropping a fixture file.
            now = datetime.datetime.utcnow()
            yesterday = now - datetime.timedelta(days=1)
            ingest_file_export_job_args = GcsfsIngestViewExportArgs(
                ingest_view_name=os.path.splitext(filename)[0],
                upper_bound_datetime_to_export=now,
                upper_bound_datetime_prev=yesterday,
            )
            self.controller.file_metadata_manager.\
                register_ingest_file_export_job(
                    ingest_file_export_job_args)
            self.controller.ingest_view_export_manager.export_view_for_args(
                ingest_file_export_job_args)
        else:
            file_path = path_for_fixture_file(self.controller,
                                              filename,
                                              file_type=file_type,
                                              should_normalize=True)
            self.controller.fs.gcs_file_system.test_add_path(
                file_path, filename)

        run_task_queues_to_empty(self.controller)
    finally:
        # Always unpatch, even if ingest raised, so the active patches do
        # not leak into other tests.
        get_region_patcher.stop()
        environ_patcher.stop()
def test_cloud_function_fails_on_new_file(self):
    """Two uploads whose upload-time handling fails are still picked up
    and processed once a later upload succeeds and triggers the
    controller."""
    controller = build_gcsfs_controller_for_tests(
        StateTestGcsfsDirectIngestController,
        self.FIXTURE_PATH_PREFIX,
        run_async=False)

    self.assertIsInstance(
        controller.cloud_task_manager,
        FakeSynchronousDirectIngestCloudTaskManager,
        "Expected FakeSynchronousDirectIngestCloudTaskManager")
    task_manager = controller.cloud_task_manager

    file_path = path_for_fixture_file(controller,
                                      'tagA.csv',
                                      should_normalize=False)
    file_path2 = path_for_fixture_file(controller,
                                       'tagB.csv',
                                       should_normalize=False)
    file_path3 = path_for_fixture_file(controller,
                                       'tagC.csv',
                                       should_normalize=False)

    if not isinstance(controller.fs, FakeDirectIngestGCSFileSystem):
        raise ValueError(f"Controller fs must have type "
                         f"FakeDirectIngestGCSFileSystem. Found instead "
                         f"type [{type(controller.fs)}]")

    # Upload two new files without triggering the controller
    controller.fs.test_add_path(file_path, fail_handle_file_call=True)
    controller.fs.test_add_path(file_path2, fail_handle_file_call=True)

    self.assertEqual(
        0, task_manager.get_scheduler_queue_info(controller.region).size())
    self.assertEqual(
        0,
        task_manager.get_process_job_queue_info(controller.region).size())

    # Later file that succeeds will trigger proper upload of all files
    controller.fs.test_add_path(file_path3)

    run_task_queues_to_empty(controller)

    check_all_paths_processed(self,
                              controller,
                              ['tagA', 'tagB', 'tagC'],
                              unexpected_tags=[])
def _run_ingest_job_for_filename(self, filename: str) -> None:
    """Runs ingest for the ingest view file with the given unnormalized
    file name."""
    environ_patcher = patch.dict("os.environ", {"PERSIST_LOCALLY": "true"})
    environ_patcher.start()

    try:
        file_type = GcsfsDirectIngestFileType.INGEST_VIEW

        if not isinstance(self.controller.fs.gcs_file_system,
                          FakeGCSFileSystem):
            raise ValueError(
                f"Controller fs must have type "
                f"FakeGCSFileSystem. Found instead "
                f"type [{type(self.controller.fs.gcs_file_system)}]")

        if self.controller.region.is_ingest_launched_in_env():
            # Launched regions exercise the ingest view export flow for
            # the last day instead of dropping a fixture file directly.
            now = datetime.datetime.now(tz=pytz.UTC)
            yesterday = now - datetime.timedelta(days=1)
            ingest_file_export_job_args = GcsfsIngestViewExportArgs(
                ingest_view_name=os.path.splitext(filename)[0],
                upper_bound_datetime_to_export=now,
                upper_bound_datetime_prev=yesterday,
                output_bucket_name=self.controller.ingest_bucket_path.
                bucket_name,
            )
            self.controller.file_metadata_manager.\
                register_ingest_file_export_job(
                    ingest_file_export_job_args)
            self.controller.ingest_view_export_manager.export_view_for_args(
                ingest_file_export_job_args)
        else:
            file_path = path_for_fixture_file(self.controller,
                                              filename,
                                              file_type=file_type,
                                              should_normalize=True)
            self.controller.fs.gcs_file_system.test_add_path(
                file_path, filename)

        run_task_queues_to_empty(self.controller)
    finally:
        # Always unpatch, even if ingest raised, so the patched environ
        # does not leak into other tests.
        environ_patcher.stop()
def test_processing_continues_if_there_are_subfolders_in_ingest_dir(self):
    """Files inside subdirectories of the ingest bucket are normalized but
    left unprocessed, while top-level files are processed and moved to
    storage as usual."""
    controller = build_gcsfs_controller_for_tests(
        StateTestGcsfsDirectIngestController,
        self.FIXTURE_PATH_PREFIX,
        run_async=False)

    if not isinstance(controller.fs, FakeDirectIngestGCSFileSystem):
        raise ValueError(f"Controller fs must have type "
                         f"FakeDirectIngestGCSFileSystem. Found instead "
                         f"type [{type(controller.fs)}]")

    subdir_path = path_for_fixture_file(controller,
                                        'subdir/',
                                        should_normalize=False)
    paths = [
        subdir_path,
        path_for_fixture_file(controller,
                              'subdir/Unexpected_Tag.csv',
                              should_normalize=False),
        path_for_fixture_file(controller,
                              'tagA.csv',
                              should_normalize=False),
        path_for_fixture_file(controller,
                              'tagB.csv',
                              should_normalize=False),
        path_for_fixture_file(controller,
                              'tagC.csv',
                              should_normalize=False),
        path_for_fixture_file(controller,
                              'subdir/tagC_2.csv',
                              should_normalize=False),
    ]

    for path in paths:
        controller.fs.test_add_path(path)

    run_task_queues_to_empty(controller)

    # Partition the resulting paths into the subdirectory itself, files
    # moved to storage, and files still in the ingest bucket.
    dir_paths_found = []
    storage_file_paths = []
    ingest_file_paths = []
    for path in controller.fs.all_paths:
        if isinstance(path, GcsfsDirectoryPath):
            dir_paths_found.append(path)
            continue
        if path.abs_path().startswith(
                controller.storage_directory_path.abs_path()):
            storage_file_paths.append(path)
        else:
            self.assertTrue(path.abs_path().startswith(
                controller.ingest_directory_path.abs_path()))
            ingest_file_paths.append(path)

    self.assertEqual(1, len(dir_paths_found))
    self.assertEqual(subdir_path, dir_paths_found[0])

    self.assertEqual(3, len(storage_file_paths))
    storage_tags = {
        filename_parts_from_path(path).file_tag
        for path in storage_file_paths
    }
    self.assertEqual({'tagA', 'tagB', 'tagC'}, storage_tags)

    for path in storage_file_paths:
        self.assertTrue(controller.fs.is_normalized_file_path(path))
        self.assertTrue(controller.fs.is_processed_file(path))

    self.assertEqual(2, len(ingest_file_paths))
    ingest_tags = {
        filename_parts_from_path(path).file_tag
        for path in ingest_file_paths
    }
    self.assertEqual({'tagC', 'Unexpected_Tag'}, ingest_tags)

    for path in ingest_file_paths:
        self.assertTrue(controller.fs.is_normalized_file_path(path))
        self.assertTrue(controller.fs.is_seen_unprocessed_file(path))
        self.assertEqual(subdir_path,
                         GcsfsDirectoryPath.from_file_path(path))
def test_move_files_from_previous_days_to_storage_incomplete_current_day(
        self):
    """An already-processed file from a previous day is moved to storage
    (under that day's directory), while the current day's processed file
    and the previous day's unexpected-tag file stay in the ingest bucket.
    """
    controller = build_gcsfs_controller_for_tests(
        StateTestGcsfsDirectIngestController,
        self.FIXTURE_PATH_PREFIX,
        run_async=False)

    if not isinstance(controller.fs, FakeDirectIngestGCSFileSystem):
        raise ValueError(f"Controller fs must have type "
                         f"FakeDirectIngestGCSFileSystem. Found instead "
                         f"type [{type(controller.fs)}]")

    previous_date = '2019-09-15'
    current_date = '2019-09-16'

    file_path_from_prev_day = path_for_fixture_file(
        controller,
        'tagB.csv',
        should_normalize=True,
        dt=datetime.datetime.fromisoformat(previous_date))

    # pylint:disable=protected-access
    processed_file_from_prev_day = \
        controller.fs._to_processed_file_path(file_path_from_prev_day)

    unexpected_file_path_from_prev_day = path_for_fixture_file(
        controller,
        'Unexpected_Tag.csv',
        should_normalize=True,
        dt=datetime.datetime.fromisoformat(previous_date))

    file_path_from_current_day = path_for_fixture_file(
        controller,
        'tagA.csv',
        should_normalize=True,
        dt=datetime.datetime.fromisoformat(current_date))

    controller.fs.test_add_path(processed_file_from_prev_day)
    controller.fs.test_add_path(unexpected_file_path_from_prev_day)
    controller.fs.test_add_path(file_path_from_current_day)

    run_task_queues_to_empty(controller)

    # NOTE: the original asserted `assertTrue(len(...), 3)`, which passes
    # the 3 as the msg argument and never compares — fixed to a real
    # equality check.
    self.assertEqual(3, len(controller.fs.all_paths))

    storage_paths = []
    processed_paths = []
    for path in controller.fs.all_paths:
        if self._path_in_storage_dir(path, controller):
            if 'Unexpected_Tag' in path.abs_path():
                self.fail('Unexpected tag found in storage dir')
            storage_paths.append(path)
        if controller.fs.is_processed_file(path):
            processed_paths.append(path)

    self.assertEqual(len(storage_paths), 1)

    expected_storage_dir_str = os.path.join(
        controller.storage_directory_path.abs_path(),
        previous_date)
    self.assertTrue(
        storage_paths[0].abs_path().startswith(expected_storage_dir_str))

    # Path that is moved retains its 'processed_' prefix.
    self.assertEqual(len(processed_paths), 2)

    processed_paths_not_in_storage = \
        [path for path in processed_paths
         if not self._path_in_storage_dir(path, controller)]

    self.assertEqual(len(processed_paths_not_in_storage), 1)

    processed_path_str = processed_paths_not_in_storage[0].abs_path()
    self.assertTrue(
        processed_path_str.startswith(
            controller.ingest_directory_path.abs_path()))
    self.assertTrue('tagA' in processed_path_str)