コード例 #1
0
    def test_process_file_that_needs_splitting(self):
        """Checks that files over the split line limit end up as split files.

        With the split line limit forced down to 1, exactly one file tag
        ('tagC') is expected to have paths in the split-file storage
        subdirectory, with the two expected split-file suffixes.
        """
        controller = build_gcsfs_controller_for_tests(
            StateTestGcsfsDirectIngestController,
            self.FIXTURE_PATH_PREFIX,
            run_async=True)

        # Force the controller to split on a one-line limit.
        controller.file_split_line_limit = 1

        # pylint:disable=protected-access
        sorted_tags = sorted(controller._get_file_tag_rank_list())

        add_paths_with_tags_and_process(
            self, controller, sorted_tags, pre_normalize_filename=True)

        # Group every path found in the split-file storage subdir by tag.
        split_paths_by_tag = defaultdict(list)
        for fs_path in controller.fs.all_paths:
            if not self._path_in_split_file_storage_subdir(fs_path,
                                                           controller):
                continue
            tag = filename_parts_from_path(fs_path).file_tag
            split_paths_by_tag[tag].append(fs_path)

        self.assertEqual(1, len(split_paths_by_tag))
        self.assertEqual(2, len(split_paths_by_tag['tagC']))

        found_suffixes = set()
        for split_path in split_paths_by_tag['tagC']:
            found_suffixes.add(
                filename_parts_from_path(split_path).filename_suffix)

        expected_suffixes = {'00001_file_split_size1',
                             '00002_file_split_size1'}
        self.assertEqual(found_suffixes, expected_suffixes)
コード例 #2
0
    def setUp(self) -> None:
        """Prepares the project-id patch, test databases, the controller
        under test and the patched persistence error thresholds."""
        self.maxDiff = 250000

        # Make metadata.project_id() report the staging project.
        self.metadata_patcher = patch("recidiviz.utils.metadata.project_id")
        self.mock_project_id_fn = self.metadata_patcher.start()
        self.mock_project_id_fn.return_value = "recidiviz-staging"

        for database_base in (self.schema_base(), OperationsBase):
            local_postgres_helpers.use_on_disk_postgresql_database(
                database_base)

        controller_cls = self.controller_cls()
        fixture_prefix = self.fixture_path_prefix()
        self.controller = build_gcsfs_controller_for_tests(
            controller_cls,
            fixture_prefix,
            run_async=False,
            max_delay_sec_between_files=0,
            regions_module=regions,
        )

        # Every STATE-level error threshold is patched to 0 for tests.
        # NOTE(review): an earlier comment here said the threshold could not
        # be 0 because persistence throws when errors *equal* the threshold;
        # the values below are nevertheless 0 - confirm the comparison
        # semantics in recidiviz.persistence.persistence.
        zeroed_state_thresholds = {
            OVERALL_THRESHOLD: 0,
            ENUM_THRESHOLD: 0,
            ENTITY_MATCHING_THRESHOLD: 0,
            DATABASE_INVARIANT_THRESHOLD: 0,
        }
        self.entity_matching_error_threshold_patcher = patch.dict(
            "recidiviz.persistence.persistence.SYSTEM_TYPE_TO_ERROR_THRESHOLD",
            {SystemLevel.STATE: zeroed_state_thresholds},
        )

        self.entity_matching_error_threshold_patcher.start()
コード例 #3
0
    def test_next_schedule_runs_before_process_job_clears(self):
        """Checks both queues drain when a schedule runs before a job clears.

        Two unnormalized files are added and the scheduler tasks they trigger
        are run to completion. A scheduler task is then run after the first
        process job has run but before it has been popped. Both queues are
        expected to be empty once the remaining tasks have been run.
        """
        controller = build_gcsfs_controller_for_tests(
            StateTestGcsfsDirectIngestController,
            self.FIXTURE_PATH_PREFIX,
            run_async=False)
        self.assertIsInstance(
            controller.cloud_task_manager,
            FakeSynchronousDirectIngestCloudTaskManager,
            "Expected FakeSynchronousDirectIngestCloudTaskManager")
        task_manager = controller.cloud_task_manager

        unnormalized_paths = [
            path_for_fixture_file(controller, file_name,
                                  should_normalize=False)
            for file_name in ('tagA.csv', 'tagB.csv')
        ]
        if not isinstance(controller.fs, FakeDirectIngestGCSFileSystem):
            raise ValueError(
                "Controller fs must have type "
                "FakeDirectIngestGCSFileSystem. Found instead "
                f"type [{type(controller.fs)}]")

        for new_path in unnormalized_paths:
            controller.fs.test_add_path(new_path)

        # Adding the files queues a series of scheduler tasks (handling /
        # renaming / splitting, then scheduling the next job). Run them all.
        while task_manager.scheduler_tasks:
            task_manager.test_run_next_scheduler_task()
            task_manager.test_pop_finished_scheduler_task()

        # The first schedule started a process job; run it but do not pop it
        # until after the next scheduler task has run.
        task_manager.test_run_next_process_job_task()

        task_manager.test_run_next_scheduler_task()
        task_manager.test_pop_finished_process_job_task()
        task_manager.test_pop_finished_scheduler_task()

        # A process job should still have been queued even though the
        # previous one had not been cleared when the schedule executed.
        task_manager.test_run_next_process_job_task()
        task_manager.test_pop_finished_process_job_task()

        task_manager.test_run_next_scheduler_task()
        task_manager.test_pop_finished_scheduler_task()

        scheduler_queue_info = task_manager.get_scheduler_queue_info(
            controller.region)
        self.assertEqual(0, scheduler_queue_info.size())
        process_job_queue_info = task_manager.get_process_job_queue_info(
            controller.region)
        self.assertEqual(0, process_job_queue_info.size())
コード例 #4
0
    def setUp(self) -> None:
        """Registers the test databases and builds the controller under
        test."""
        self.maxDiff = 250000

        # Both the schema under test and the operations schema are set up
        # through use_on_disk_postgresql_database.
        for database_base in (self.schema_base(), OperationsBase):
            local_postgres_helpers.use_on_disk_postgresql_database(
                database_base)

        controller_cls = self.controller_cls()
        fixture_prefix = self.fixture_path_prefix()
        self.controller = build_gcsfs_controller_for_tests(
            controller_cls,
            fixture_prefix,
            run_async=False,
            max_delay_sec_between_files=0)
コード例 #5
0
    def test_do_not_queue_same_job_twice(self):
        """Checks that only one process job remains queued at the end.

        A first file is added and its scheduler tasks are run to completion;
        a second file is then added. After the listed scheduler and process
        job tasks have been run, the scheduler queue is expected to be empty
        and the process job queue to hold exactly one job.
        """
        controller = build_gcsfs_controller_for_tests(
            StateTestGcsfsDirectIngestController,
            self.FIXTURE_PATH_PREFIX,
            run_async=False)
        self.assertIsInstance(
            controller.cloud_task_manager,
            FakeSynchronousDirectIngestCloudTaskManager,
            "Expected FakeSynchronousDirectIngestCloudTaskManager")
        task_manager = controller.cloud_task_manager

        first_path, second_path = [
            path_for_fixture_file(controller, file_name,
                                  should_normalize=False)
            for file_name in ('tagA.csv', 'tagB.csv')
        ]
        if not isinstance(controller.fs, FakeDirectIngestGCSFileSystem):
            raise ValueError(
                "Controller fs must have type "
                "FakeDirectIngestGCSFileSystem. Found instead "
                f"type [{type(controller.fs)}]")

        controller.fs.test_add_path(first_path)

        # Drain every scheduler task triggered by the first file.
        while task_manager.scheduler_tasks:
            task_manager.test_run_next_scheduler_task()
            task_manager.test_pop_finished_scheduler_task()

        controller.fs.test_add_path(second_path)

        # Scheduler task that handles the unnormalized second file.
        task_manager.test_run_next_scheduler_task()
        task_manager.test_pop_finished_scheduler_task()

        task_manager.test_run_next_process_job_task()
        task_manager.test_pop_finished_process_job_task()

        # Scheduler task queued after the path was normalized; it schedules
        # the next process_job.
        task_manager.test_run_next_scheduler_task()
        task_manager.test_pop_finished_scheduler_task()

        # Scheduler task queued by the job that just finished.
        task_manager.test_run_next_scheduler_task()
        task_manager.test_pop_finished_scheduler_task()

        scheduler_queue_info = task_manager.get_scheduler_queue_info(
            controller.region)
        self.assertEqual(0, scheduler_queue_info.size())
        process_job_queue_info = task_manager.get_process_job_queue_info(
            controller.region)
        self.assertEqual(1, process_job_queue_info.size())
コード例 #6
0
    def test_process_already_normalized_paths(self):
        """Adds and processes every file tag with pre-normalized file names."""
        controller = build_gcsfs_controller_for_tests(
            StateTestGcsfsDirectIngestController,
            self.FIXTURE_PATH_PREFIX,
            run_async=True)

        # pylint:disable=protected-access
        sorted_tags = sorted(controller._get_file_tag_rank_list())

        add_paths_with_tags_and_process(
            self, controller, sorted_tags, pre_normalize_filename=True)
コード例 #7
0
    def test_cloud_function_fails_on_new_file_rename_later_with_cron(self):
        """Checks that a later handle-new-files task normalizes missed files.

        Three files are added with fail_handle_file_call=True, so no tasks
        are queued and no path is normalized. After a handle-new-files task
        is created and the queues are run to empty, every path is expected to
        be normalized.
        """
        controller = build_gcsfs_controller_for_tests(
            StateTestGcsfsDirectIngestController,
            self.FIXTURE_PATH_PREFIX,
            run_async=False)
        self.assertIsInstance(
            controller.cloud_task_manager,
            FakeSynchronousDirectIngestCloudTaskManager,
            "Expected FakeSynchronousDirectIngestCloudTaskManager")
        task_manager = controller.cloud_task_manager

        new_file_paths = [
            path_for_fixture_file(controller, file_name,
                                  should_normalize=False)
            for file_name in ('tagA.csv', 'tagB.csv', 'tagC.csv')
        ]

        if not isinstance(controller.fs, FakeDirectIngestGCSFileSystem):
            raise ValueError(
                "Controller fs must have type "
                "FakeDirectIngestGCSFileSystem. Found instead "
                f"type [{type(controller.fs)}]")

        # Upload the new files without triggering the controller.
        for new_path in new_file_paths:
            controller.fs.test_add_path(new_path, fail_handle_file_call=True)

        scheduler_queue_info = task_manager.get_scheduler_queue_info(
            controller.region)
        self.assertEqual(0, scheduler_queue_info.size())
        process_job_queue_info = task_manager.get_process_job_queue_info(
            controller.region)
        self.assertEqual(0, process_job_queue_info.size())

        for fs_path in controller.fs.all_paths:
            self.assertFalse(controller.fs.is_normalized_file_path(fs_path))

        # The cron job that handles unseen files triggers later.
        controller.cloud_task_manager \
            .create_direct_ingest_handle_new_files_task(
                controller.region, can_start_ingest=False)

        run_task_queues_to_empty(controller)

        for fs_path in controller.fs.all_paths:
            self.assertTrue(controller.fs.is_normalized_file_path(fs_path))
    def setUp(self) -> None:
        """Sets up the in-memory test database, the controller under test and
        a near-zero persistence error threshold."""
        self.maxDiff = 250000
        fakes.use_in_memory_sqlite_database(self.schema_base())

        self.controller = build_gcsfs_controller_for_tests(
            self.controller_cls(),
            self.fixture_path_prefix(),
            run_async=False,
            max_delay_sec_between_files=0)

        # Set entity matching error threshold to a diminishingly small number
        # for tests. We cannot set it to 0 because we throw when errors *equal*
        # the error threshold.
        # The previous value, pow(1, -10), evaluates to 1.0 rather than a
        # small number, so 1e-10 is used to match the intent stated above.
        self.entity_matching_error_threshold_patcher = patch(
            'recidiviz.persistence.persistence.ERROR_THRESHOLD', 1e-10)
        self.entity_matching_error_threshold_patcher.start()
コード例 #9
0
    def run_async_file_order_test_for_controller_cls(self, controller_cls):
        """Adds files for every tag in reverse-sorted tag order, processes
        them with an async controller, and checks that the controller reports
        no temp paths left on disk."""

        controller = build_gcsfs_controller_for_tests(
            controller_cls, self.FIXTURE_PATH_PREFIX, run_async=True)

        # pylint:disable=protected-access
        descending_tags = sorted(controller._get_file_tag_rank_list())[::-1]

        add_paths_with_tags_and_process(self, controller, descending_tags)

        self.assertIsInstance(
            controller, BaseTestCsvGcsfsDirectIngestController)
        self.assertFalse(controller.has_temp_paths_in_disk())
コード例 #10
0
    def test_move_files_from_previous_days_to_storage(self):
        """Checks that a processed file from a previous day moves to storage.

        A processed 'tagB' file and an unexpected-tag file, both dated
        2019-09-15, are added before the regular files are added and
        processed. Exactly one path is expected under the storage directory
        for that date, and it should be the 'tagB' file.
        """
        controller = build_gcsfs_controller_for_tests(
            StateTestGcsfsDirectIngestController,
            self.FIXTURE_PATH_PREFIX,
            run_async=False)

        previous_date = '2019-09-15'
        file_path_from_prev_day = path_for_fixture_file(
            controller,
            'tagB.csv',
            should_normalize=True,
            dt=datetime.datetime.fromisoformat(previous_date))

        # pylint:disable=protected-access
        processed_file_from_prev_day = \
            controller.fs._to_processed_file_path(file_path_from_prev_day)

        unexpected_file_path_from_prev_day = path_for_fixture_file(
            controller,
            'Unexpected_Tag.csv',
            should_normalize=True,
            dt=datetime.datetime.fromisoformat(previous_date))

        controller.fs.test_add_path(processed_file_from_prev_day)
        controller.fs.test_add_path(unexpected_file_path_from_prev_day)

        # pylint:disable=protected-access
        file_tags = sorted(controller._get_file_tag_rank_list())

        # This will test that all paths get moved to storage,
        # except the unexpected tag.
        add_paths_with_tags_and_process(self,
                                        controller,
                                        file_tags,
                                        unexpected_tags=['Unexpected_Tag'])

        # The expected storage prefix does not depend on the path being
        # inspected, so compute it once.
        expected_storage_dir_str = os.path.join(
            controller.storage_directory_path.abs_path(), previous_date)
        paths_from_prev_date = [
            path for path in controller.fs.all_paths
            if path.abs_path().startswith(expected_storage_dir_str)
        ]

        # This was assertTrue(len(...), 1), which treats 1 as the failure
        # message and only checks that the list is non-empty.
        self.assertEqual(1, len(paths_from_prev_date))
        self.assertIn('tagB', paths_from_prev_date[0].abs_path())
コード例 #11
0
    def test_state_unexpected_tag(self):
        """Processes a mix of expected and unexpected file tags and checks
        that no split files are left in the file system."""
        controller = build_gcsfs_controller_for_tests(
            StateTestGcsfsDirectIngestController,
            self.FIXTURE_PATH_PREFIX,
            run_async=True)

        unexpected_tags = ['Unexpected_Tag']
        file_tags = ['tagA', 'Unexpected_Tag', 'tagB', 'tagC']

        add_paths_with_tags_and_process(
            self, controller, file_tags, unexpected_tags)

        split_paths = set()
        for fs_path in controller.fs.all_paths:
            if controller.fs.is_split_file(fs_path):
                split_paths.add(fs_path)
        self.assertFalse(split_paths)
コード例 #12
0
    def test_cloud_function_fails_on_new_file(self):
        """Checks that a later successful upload picks up earlier missed files.

        Two files are added with fail_handle_file_call=True, which leaves
        both task queues empty. A third file is then added normally, the
        queues are run to empty, and all three tags are checked as processed.
        """
        controller = build_gcsfs_controller_for_tests(
            StateTestGcsfsDirectIngestController,
            self.FIXTURE_PATH_PREFIX,
            run_async=False)
        self.assertIsInstance(
            controller.cloud_task_manager,
            FakeSynchronousDirectIngestCloudTaskManager,
            "Expected FakeSynchronousDirectIngestCloudTaskManager")
        task_manager = controller.cloud_task_manager

        missed_path_a, missed_path_b, triggering_path = [
            path_for_fixture_file(controller, file_name,
                                  should_normalize=False)
            for file_name in ('tagA.csv', 'tagB.csv', 'tagC.csv')
        ]

        if not isinstance(controller.fs, FakeDirectIngestGCSFileSystem):
            raise ValueError(
                "Controller fs must have type "
                "FakeDirectIngestGCSFileSystem. Found instead "
                f"type [{type(controller.fs)}]")

        # Upload two new files without triggering the controller.
        for missed_path in (missed_path_a, missed_path_b):
            controller.fs.test_add_path(missed_path,
                                        fail_handle_file_call=True)

        scheduler_queue_info = task_manager.get_scheduler_queue_info(
            controller.region)
        self.assertEqual(0, scheduler_queue_info.size())
        process_job_queue_info = task_manager.get_process_job_queue_info(
            controller.region)
        self.assertEqual(0, process_job_queue_info.size())

        # A later file that succeeds triggers proper upload of all files.
        controller.fs.test_add_path(triggering_path)

        run_task_queues_to_empty(controller)
        expected_tags = ['tagA', 'tagB', 'tagC']
        check_all_paths_processed(
            self, controller, expected_tags, unexpected_tags=[])
コード例 #13
0
    def setUp(self) -> None:
        """Prepares the project-id patch, test databases, the controller
        under test, patched error thresholds and ingest instance statuses."""
        self.maxDiff = 250000

        # Make metadata.project_id() report the staging project.
        self.metadata_patcher = patch("recidiviz.utils.metadata.project_id")
        self.mock_project_id_fn = self.metadata_patcher.start()
        self.mock_project_id_fn.return_value = "recidiviz-staging"

        self.main_database_key = self._main_database_key()
        self.operations_database_key = SQLAlchemyDatabaseKey.for_schema(
            SchemaType.OPERATIONS)
        for database_key in (self.main_database_key,
                             self.operations_database_key):
            local_postgres_helpers.use_on_disk_postgresql_database(
                database_key)

        controller_cls = self.controller_cls()
        main_instance = self._main_ingest_instance()
        self.controller = build_gcsfs_controller_for_tests(
            controller_cls,
            ingest_instance=main_instance,
            run_async=False,
            regions_module=regions,
        )

        # Every STATE-level error threshold is patched to 0 for tests.
        # NOTE(review): an earlier comment here said the threshold could not
        # be 0 because persistence throws when errors *equal* the threshold;
        # the values below are nevertheless 0 - confirm the comparison
        # semantics in recidiviz.persistence.persistence.
        zeroed_state_thresholds = {
            OVERALL_THRESHOLD: 0,
            ENUM_THRESHOLD: 0,
            ENTITY_MATCHING_THRESHOLD: 0,
            DATABASE_INVARIANT_THRESHOLD: 0,
        }
        self.entity_matching_error_threshold_patcher = patch.dict(
            "recidiviz.persistence.persistence.SYSTEM_TYPE_TO_ERROR_THRESHOLD",
            {SystemLevel.STATE: zeroed_state_thresholds},
        )

        self.entity_matching_error_threshold_patcher.start()

        # Register every ingest instance; the third argument is True for all
        # instances other than the main one.
        for ingest_instance in DirectIngestInstance:
            region_code = self.region_code()
            differs_from_main = (
                ingest_instance != self._main_ingest_instance())
            DirectIngestInstanceStatusManager.add_instance(
                region_code, ingest_instance, differs_from_main)
コード例 #14
0
    def setUp(self) -> None:
        """Sets up the test databases, the controller under test and a
        near-zero persistence error threshold."""
        self.maxDiff = 250000

        # TODO(3289): Fix hanging state table queries so we can use an on-disk postgres DB for the State/Jails schemas
        # as well. Currently, using postgres for StateBase causes a hang when we go to drop the tables in
        # stop_and_clear_on_disk_postgresql_database()
        fakes.use_in_memory_sqlite_database(self.schema_base())
        fakes.use_on_disk_postgresql_database(OperationsBase)

        self.controller = build_gcsfs_controller_for_tests(
            self.controller_cls(),
            self.fixture_path_prefix(),
            run_async=False,
            max_delay_sec_between_files=0)

        # Set entity matching error threshold to a diminishingly small number
        # for tests. We cannot set it to 0 because we throw when errors *equal*
        # the error threshold.
        # The previous value, pow(1, -10), evaluates to 1.0 rather than a
        # small number, so 1e-10 is used to match the intent stated above.
        self.entity_matching_error_threshold_patcher = patch(
            'recidiviz.persistence.persistence.ERROR_THRESHOLD', 1e-10)
        self.entity_matching_error_threshold_patcher.start()
コード例 #15
0
    def test_failing_to_process_a_file_that_needs_splitting_no_loop(self):
        """Checks that an over-limit file which was never split does not
        cause jobs to be re-queued.

        The file is added with fail_handle_file_call=True and the scheduler
        is kicked manually. After one scheduler task and one process job
        task have run, both queues are expected to be empty.
        """
        controller = build_gcsfs_controller_for_tests(
            StateTestGcsfsDirectIngestController,
            self.FIXTURE_PATH_PREFIX,
            run_async=False)
        self.assertIsInstance(
            controller.cloud_task_manager,
            FakeSynchronousDirectIngestCloudTaskManager,
            "Expected FakeSynchronousDirectIngestCloudTaskManager")
        task_manager = controller.cloud_task_manager

        # Force the controller to split on a one-line limit.
        controller.file_split_line_limit = 1

        # This file exceeds the split limit, but because it is added with
        # fail_handle_file_call=True it won't get picked up and split.
        file_date = datetime.datetime.fromisoformat('2019-09-19')
        oversized_path = path_for_fixture_file(
            controller, 'tagC.csv', should_normalize=True, dt=file_date)
        controller.fs.test_add_path(oversized_path,
                                    fail_handle_file_call=True)

        controller.kick_scheduler(just_finished_job=False)

        task_manager.test_run_next_scheduler_task()
        task_manager.test_pop_finished_scheduler_task()

        task_manager.test_run_next_process_job_task()
        task_manager.test_pop_finished_process_job_task()

        # The process job task, which tries to process a file that is too
        # big, must not schedule another job for the same file (that would
        # just get us into a loop).
        scheduler_queue_info = task_manager.get_scheduler_queue_info(
            controller.region)
        self.assertEqual(0, scheduler_queue_info.size())
        process_job_queue_info = task_manager.get_process_job_queue_info(
            controller.region)
        self.assertEqual(0, process_job_queue_info.size())
コード例 #16
0
    def test_do_not_schedule_more_than_one_delayed_scheduler_job(self):
        """Checks that repeated scheduler kicks leave one scheduler job.

        Adding a file kicks the scheduler once and it is then kicked five
        more times. After the six queued scheduler tasks have run, the
        scheduler queue is expected to hold one job and the process job
        queue none.
        """
        controller = build_gcsfs_controller_for_tests(
            StateTestGcsfsDirectIngestController,
            self.FIXTURE_PATH_PREFIX,
            run_async=False)
        self.assertIsInstance(
            controller.cloud_task_manager,
            FakeSynchronousDirectIngestCloudTaskManager,
            "Expected FakeSynchronousDirectIngestCloudTaskManager")
        task_manager = controller.cloud_task_manager

        new_file_path = path_for_fixture_file(
            controller, 'tagB.csv', should_normalize=False)

        if not isinstance(controller.fs, FakeDirectIngestGCSFileSystem):
            raise ValueError(
                "Controller fs must have type "
                "FakeDirectIngestGCSFileSystem. Found instead "
                f"type [{type(controller.fs)}]")

        # Adding the path kicks the scheduler once.
        controller.fs.test_add_path(new_file_path)

        # Kick the scheduler five more times.
        extra_kicks = 5
        for _ in range(extra_kicks):
            controller.kick_scheduler(just_finished_job=False)

        # Six immediate jobs are now queued; run and pop each of them.
        for _ in range(extra_kicks + 1):
            task_manager.test_run_next_scheduler_task()
            task_manager.test_pop_finished_scheduler_task()

        # After running all those jobs, only one job should remain in the
        # scheduler queue.
        scheduler_queue_info = task_manager.get_scheduler_queue_info(
            controller.region)
        self.assertEqual(1, scheduler_queue_info.size())
        process_job_queue_info = task_manager.get_process_job_queue_info(
            controller.region)
        self.assertEqual(0, process_job_queue_info.size())
コード例 #17
0
    def test_processing_continues_if_there_are_subfolders_in_ingest_dir(self):
        """Checks that top-level files are processed despite a subdirectory.

        A subdirectory, two files inside it and three top-level files are
        added. After the queues are run to empty, the three top-level files
        are expected in storage as processed files, while the two files in
        the subdirectory are expected to remain there as seen-but-unprocessed
        files.
        """
        controller = build_gcsfs_controller_for_tests(
            StateTestGcsfsDirectIngestController,
            self.FIXTURE_PATH_PREFIX,
            run_async=False)

        if not isinstance(controller.fs, FakeDirectIngestGCSFileSystem):
            raise ValueError(
                "Controller fs must have type "
                "FakeDirectIngestGCSFileSystem. Found instead "
                f"type [{type(controller.fs)}]")

        def file_tags_of(file_paths):
            """Returns the set of file tags parsed from the given paths."""
            return {
                filename_parts_from_path(file_path).file_tag
                for file_path in file_paths
            }

        subdir_path = path_for_fixture_file(
            controller, 'subdir/', should_normalize=False)
        file_names = [
            'subdir/Unexpected_Tag.csv',
            'tagA.csv',
            'tagB.csv',
            'tagC.csv',
            'subdir/tagC_2.csv',
        ]
        paths = [subdir_path]
        for file_name in file_names:
            paths.append(
                path_for_fixture_file(controller, file_name,
                                      should_normalize=False))

        for new_path in paths:
            controller.fs.test_add_path(new_path)

        run_task_queues_to_empty(controller)

        dir_paths_found = []
        storage_file_paths = []
        ingest_file_paths = []

        # Sort every remaining path into directory / storage / ingest.
        for fs_path in controller.fs.all_paths:
            if isinstance(fs_path, GcsfsDirectoryPath):
                dir_paths_found.append(fs_path)
            elif fs_path.abs_path().startswith(
                    controller.storage_directory_path.abs_path()):
                storage_file_paths.append(fs_path)
            else:
                self.assertTrue(fs_path.abs_path().startswith(
                    controller.ingest_directory_path.abs_path()))
                ingest_file_paths.append(fs_path)

        self.assertEqual(1, len(dir_paths_found))
        self.assertEqual(subdir_path, dir_paths_found[0])

        self.assertEqual(3, len(storage_file_paths))
        self.assertEqual({'tagA', 'tagB', 'tagC'},
                         file_tags_of(storage_file_paths))

        for stored_path in storage_file_paths:
            self.assertTrue(
                controller.fs.is_normalized_file_path(stored_path))
            self.assertTrue(controller.fs.is_processed_file(stored_path))

        self.assertEqual(2, len(ingest_file_paths))
        self.assertEqual({'tagC', 'Unexpected_Tag'},
                         file_tags_of(ingest_file_paths))

        for remaining_path in ingest_file_paths:
            self.assertTrue(
                controller.fs.is_normalized_file_path(remaining_path))
            self.assertTrue(
                controller.fs.is_seen_unprocessed_file(remaining_path))
            self.assertEqual(
                subdir_path,
                GcsfsDirectoryPath.from_file_path(remaining_path))
コード例 #18
0
    def test_move_files_from_previous_days_to_storage_incomplete_current_day(
            self):
        """Checks that only the previous day's processed file moves to storage.

        A processed 'tagB' file and an unexpected-tag file dated 2019-09-15,
        plus a 'tagA' file dated 2019-09-16, are added and the queues are run
        to empty. The 'tagB' file is expected under the storage directory for
        its date, the 'tagA' file is expected to be processed but still in
        the ingest directory, and the unexpected-tag file must not be in
        storage.
        """
        controller = build_gcsfs_controller_for_tests(
            StateTestGcsfsDirectIngestController,
            self.FIXTURE_PATH_PREFIX,
            run_async=False)
        if not isinstance(controller.fs, FakeDirectIngestGCSFileSystem):
            raise ValueError(f"Controller fs must have type "
                             f"FakeDirectIngestGCSFileSystem. Found instead "
                             f"type [{type(controller.fs)}]")

        previous_date = '2019-09-15'
        current_date = '2019-09-16'

        file_path_from_prev_day = path_for_fixture_file(
            controller,
            'tagB.csv',
            should_normalize=True,
            dt=datetime.datetime.fromisoformat(previous_date))

        # pylint:disable=protected-access
        processed_file_from_prev_day = \
            controller.fs._to_processed_file_path(file_path_from_prev_day)

        unexpected_file_path_from_prev_day = path_for_fixture_file(
            controller,
            'Unexpected_Tag.csv',
            should_normalize=True,
            dt=datetime.datetime.fromisoformat(previous_date))

        file_path_from_current_day = path_for_fixture_file(
            controller,
            'tagA.csv',
            should_normalize=True,
            dt=datetime.datetime.fromisoformat(current_date))

        controller.fs.test_add_path(processed_file_from_prev_day)
        controller.fs.test_add_path(unexpected_file_path_from_prev_day)
        controller.fs.test_add_path(file_path_from_current_day)

        run_task_queues_to_empty(controller)

        # This was assertTrue(len(...), 3), which treats 3 as the failure
        # message and only checks that there is at least one path.
        self.assertEqual(3, len(controller.fs.all_paths))

        storage_paths = []
        processed_paths = []
        for path in controller.fs.all_paths:
            if self._path_in_storage_dir(path, controller):
                if 'Unexpected_Tag' in path.abs_path():
                    self.fail('Unexpected tag found in storage dir')
                storage_paths.append(path)
            if controller.fs.is_processed_file(path):
                processed_paths.append(path)

        self.assertEqual(len(storage_paths), 1)

        expected_storage_dir_str = os.path.join(
            controller.storage_directory_path.abs_path(), previous_date)
        self.assertTrue(
            storage_paths[0].abs_path().startswith(expected_storage_dir_str))

        # Path that is moved retains its 'processed_' prefix.
        self.assertEqual(len(processed_paths), 2)

        processed_paths_not_in_storage = \
            [path
             for path in processed_paths
             if not self._path_in_storage_dir(path, controller)]

        self.assertEqual(len(processed_paths_not_in_storage), 1)

        processed_path_str = processed_paths_not_in_storage[0].abs_path()
        self.assertTrue(
            processed_path_str.startswith(
                controller.ingest_directory_path.abs_path()))
        self.assertIn('tagA', processed_path_str)