Beispiel #1
0
    def _run_ingest_job_for_filename(self, filename: str) -> None:
        """Adds the fixture file to the fake file system and drains the queues.

        While the job runs, `get_region` in the state entity matching
        delegate module is patched to return `self._fake_region()` and the
        PERSIST_LOCALLY environment variable is set to 'true'. Both patches
        are applied via context managers so they are undone even when the
        run raises, which keeps a failure here from leaking patched state
        into other tests.

        Args:
            filename: Unnormalized name of the fixture file to add.

        Raises:
            ValueError: If the controller's file system is not a
                FakeDirectIngestGCSFileSystem.
        """
        with patch(
                "recidiviz.persistence.entity_matching.state."
                "base_state_matching_delegate.get_region") as mock_get_region:
            mock_get_region.return_value = self._fake_region()

            with patch.dict('os.environ', {'PERSIST_LOCALLY': 'true'}):
                file_path = path_for_fixture_file(self.controller,
                                                  filename,
                                                  should_normalize=False)

                if not isinstance(self.controller.fs,
                                  FakeDirectIngestGCSFileSystem):
                    raise ValueError(
                        f"Controller fs must have type "
                        f"FakeDirectIngestGCSFileSystem. Found instead "
                        f"type [{type(self.controller.fs)}]")

                self.controller.fs.test_add_path(file_path)

                run_task_queues_to_empty(self.controller)
Beispiel #2
0
    def test_cloud_function_fails_on_new_file_rename_later_with_cron(self):
        """Files whose upload handling failed get normalized by a later task.

        Three unnormalized files are added with fail_handle_file_call=True.
        The test checks that both task queues are empty and no path is
        normalized, then creates a handle-new-files task, drains the queues
        and checks that every path is now normalized.
        """
        controller = build_gcsfs_controller_for_tests(
            StateTestGcsfsDirectIngestController,
            self.FIXTURE_PATH_PREFIX,
            run_async=False)
        task_manager = controller.cloud_task_manager
        self.assertIsInstance(
            task_manager,
            FakeSynchronousDirectIngestCloudTaskManager,
            "Expected FakeSynchronousDirectIngestCloudTaskManager")

        new_file_paths = [
            path_for_fixture_file(controller, name, should_normalize=False)
            for name in ('tagA.csv', 'tagB.csv', 'tagC.csv')
        ]

        if not isinstance(controller.fs, FakeDirectIngestGCSFileSystem):
            raise ValueError(f"Controller fs must have type "
                             f"FakeDirectIngestGCSFileSystem. Found instead "
                             f"type [{type(controller.fs)}]")

        # Upload new files without triggering the controller
        for new_file_path in new_file_paths:
            controller.fs.test_add_path(new_file_path,
                                        fail_handle_file_call=True)

        # Nothing should have been scheduled by the failed uploads.
        scheduler_queue = task_manager.get_scheduler_queue_info(
            controller.region)
        self.assertEqual(0, scheduler_queue.size())
        process_job_queue = task_manager.get_process_job_queue_info(
            controller.region)
        self.assertEqual(0, process_job_queue.size())

        for existing_path in controller.fs.all_paths:
            self.assertFalse(
                controller.fs.is_normalized_file_path(existing_path))

        # Cron job to handle unseen files triggers later
        task_manager.create_direct_ingest_handle_new_files_task(
            controller.region, can_start_ingest=False)

        run_task_queues_to_empty(controller)

        for existing_path in controller.fs.all_paths:
            self.assertTrue(
                controller.fs.is_normalized_file_path(existing_path))
    def _run_ingest_job_for_filename(self, filename: str) -> None:
        """Runs ingest for the ingest view file with the given unnormalized file name.

        While the job runs, `get_region` in the state entity matching
        delegate module is patched to return `self._fake_region()` and the
        PERSIST_LOCALLY environment variable is set to 'true'. Both patches
        are applied via context managers so they are undone even when the
        run raises, which keeps a failure here from leaking patched state
        into other tests.

        If ingest view exports are enabled for the region, an export job
        covering the last day is registered and exported; otherwise the
        normalized fixture file is added directly to the fake file system.
        The controller's task queues are then run until empty.

        Args:
            filename: Unnormalized name of the ingest view fixture file.

        Raises:
            ValueError: If the controller's underlying GCS file system is not
                a FakeGCSFileSystem.
        """
        with patch(
                "recidiviz.persistence.entity_matching.state."
                "base_state_matching_delegate.get_region") as mock_get_region:
            mock_get_region.return_value = self._fake_region()

            with patch.dict("os.environ", {"PERSIST_LOCALLY": "true"}):
                # The file type is only passed when the region distinguishes
                # raw files from ingest view files by name.
                file_type = (
                    GcsfsDirectIngestFileType.INGEST_VIEW
                    if self.controller.region.
                    is_raw_vs_ingest_file_name_detection_enabled() else None)

                if not isinstance(self.controller.fs.gcs_file_system,
                                  FakeGCSFileSystem):
                    raise ValueError(
                        f"Controller fs must have type "
                        f"FakeGCSFileSystem. Found instead "
                        f"type [{type(self.controller.fs.gcs_file_system)}]")

                if self.controller.region.\
                        are_ingest_view_exports_enabled_in_env():
                    now = datetime.datetime.utcnow()
                    yesterday = now - datetime.timedelta(days=1)
                    ingest_file_export_job_args = GcsfsIngestViewExportArgs(
                        ingest_view_name=os.path.splitext(filename)[0],
                        upper_bound_datetime_to_export=now,
                        upper_bound_datetime_prev=yesterday,
                    )

                    self.controller.file_metadata_manager.\
                        register_ingest_file_export_job(
                            ingest_file_export_job_args)
                    self.controller.ingest_view_export_manager.\
                        export_view_for_args(ingest_file_export_job_args)
                else:
                    file_path = path_for_fixture_file(self.controller,
                                                      filename,
                                                      file_type=file_type,
                                                      should_normalize=True)
                    self.controller.fs.gcs_file_system.test_add_path(
                        file_path, filename)

                run_task_queues_to_empty(self.controller)
Beispiel #4
0
    def test_cloud_function_fails_on_new_file(self):
        """A later successful upload triggers processing of earlier files.

        Two files are added with fail_handle_file_call=True and the test
        checks that both task queues stay empty. A third file is then added
        normally, the queues are drained, and all three tags are expected to
        have been processed.
        """
        controller = build_gcsfs_controller_for_tests(
            StateTestGcsfsDirectIngestController,
            self.FIXTURE_PATH_PREFIX,
            run_async=False)
        task_manager = controller.cloud_task_manager
        self.assertIsInstance(
            task_manager,
            FakeSynchronousDirectIngestCloudTaskManager,
            "Expected FakeSynchronousDirectIngestCloudTaskManager")

        failing_path_a, failing_path_b, succeeding_path = (
            path_for_fixture_file(controller, name, should_normalize=False)
            for name in ('tagA.csv', 'tagB.csv', 'tagC.csv')
        )

        if not isinstance(controller.fs, FakeDirectIngestGCSFileSystem):
            raise ValueError(f"Controller fs must have type "
                             f"FakeDirectIngestGCSFileSystem. Found instead "
                             f"type [{type(controller.fs)}]")

        # Upload two new files without triggering the controller
        for failing_path in (failing_path_a, failing_path_b):
            controller.fs.test_add_path(failing_path,
                                        fail_handle_file_call=True)

        scheduler_queue = task_manager.get_scheduler_queue_info(
            controller.region)
        self.assertEqual(0, scheduler_queue.size())
        process_job_queue = task_manager.get_process_job_queue_info(
            controller.region)
        self.assertEqual(0, process_job_queue.size())

        # Later file that succeeds will trigger proper upload of all files
        controller.fs.test_add_path(succeeding_path)

        run_task_queues_to_empty(controller)
        check_all_paths_processed(self,
                                  controller, ['tagA', 'tagB', 'tagC'],
                                  unexpected_tags=[])
    def _run_ingest_job_for_filename(self, filename: str) -> None:
        """Runs ingest for the ingest view file with the given unnormalized file name.

        The PERSIST_LOCALLY environment variable is set to 'true' for the
        duration of the run. The patch is applied via a context manager so
        it is undone even when the run raises, which keeps a failure here
        from leaking the variable into other tests.

        If ingest is launched for the region in this environment, an export
        job covering the last day is registered and exported; otherwise the
        normalized fixture file is added directly to the fake file system.
        The controller's task queues are then run until empty.

        Args:
            filename: Unnormalized name of the ingest view fixture file.

        Raises:
            ValueError: If the controller's underlying GCS file system is not
                a FakeGCSFileSystem.
        """
        with patch.dict("os.environ", {"PERSIST_LOCALLY": "true"}):
            file_type = GcsfsDirectIngestFileType.INGEST_VIEW

            if not isinstance(self.controller.fs.gcs_file_system,
                              FakeGCSFileSystem):
                raise ValueError(
                    f"Controller fs must have type "
                    f"FakeGCSFileSystem. Found instead "
                    f"type [{type(self.controller.fs.gcs_file_system)}]")

            if self.controller.region.is_ingest_launched_in_env():
                now = datetime.datetime.now(tz=pytz.UTC)
                yesterday = now - datetime.timedelta(days=1)
                ingest_file_export_job_args = GcsfsIngestViewExportArgs(
                    ingest_view_name=os.path.splitext(filename)[0],
                    upper_bound_datetime_to_export=now,
                    upper_bound_datetime_prev=yesterday,
                    output_bucket_name=self.controller.ingest_bucket_path.
                    bucket_name,
                )

                self.controller.file_metadata_manager.\
                    register_ingest_file_export_job(
                        ingest_file_export_job_args)
                self.controller.ingest_view_export_manager.\
                    export_view_for_args(ingest_file_export_job_args)
            else:
                file_path = path_for_fixture_file(self.controller,
                                                  filename,
                                                  file_type=file_type,
                                                  should_normalize=True)
                self.controller.fs.gcs_file_system.test_add_path(
                    file_path, filename)

            run_task_queues_to_empty(self.controller)
Beispiel #6
0
    def test_processing_continues_if_there_are_subfolders_in_ingest_dir(self):
        """Top-level files are processed even when the ingest dir has a subdir.

        A subdirectory, two files inside it and three top-level files are
        added. After the queues are drained the test expects the three
        top-level tags to be processed and in storage, while the subdirectory
        and its two files remain in the ingest directory as seen but
        unprocessed.
        """
        controller = build_gcsfs_controller_for_tests(
            StateTestGcsfsDirectIngestController,
            self.FIXTURE_PATH_PREFIX,
            run_async=False)

        if not isinstance(controller.fs, FakeDirectIngestGCSFileSystem):
            raise ValueError(f"Controller fs must have type "
                             f"FakeDirectIngestGCSFileSystem. Found instead "
                             f"type [{type(controller.fs)}]")

        def unnormalized_path(name):
            # Shorthand for building an unnormalized fixture path.
            return path_for_fixture_file(controller, name,
                                         should_normalize=False)

        subdir_path = unnormalized_path('subdir/')
        paths = [subdir_path]
        paths.extend(
            unnormalized_path(name)
            for name in ('subdir/Unexpected_Tag.csv',
                         'tagA.csv',
                         'tagB.csv',
                         'tagC.csv',
                         'subdir/tagC_2.csv'))

        for added_path in paths:
            controller.fs.test_add_path(added_path)

        run_task_queues_to_empty(controller)

        # Sort every remaining path into directory / storage / ingest buckets.
        dir_paths_found = []
        storage_file_paths = []
        ingest_file_paths = []

        for found_path in controller.fs.all_paths:
            if isinstance(found_path, GcsfsDirectoryPath):
                dir_paths_found.append(found_path)
            elif found_path.abs_path().startswith(
                    controller.storage_directory_path.abs_path()):
                storage_file_paths.append(found_path)
            else:
                self.assertTrue(found_path.abs_path().startswith(
                    controller.ingest_directory_path.abs_path()))
                ingest_file_paths.append(found_path)

        self.assertEqual(1, len(dir_paths_found))
        self.assertEqual(subdir_path, dir_paths_found[0])

        self.assertEqual(3, len(storage_file_paths))
        storage_tags = set()
        for storage_path in storage_file_paths:
            storage_tags.add(filename_parts_from_path(storage_path).file_tag)
        self.assertEqual({'tagA', 'tagB', 'tagC'}, storage_tags)

        for storage_path in storage_file_paths:
            self.assertTrue(
                controller.fs.is_normalized_file_path(storage_path))
            self.assertTrue(controller.fs.is_processed_file(storage_path))

        self.assertEqual(2, len(ingest_file_paths))
        ingest_tags = set()
        for ingest_path in ingest_file_paths:
            ingest_tags.add(filename_parts_from_path(ingest_path).file_tag)
        self.assertEqual({'tagC', 'Unexpected_Tag'}, ingest_tags)

        for ingest_path in ingest_file_paths:
            self.assertTrue(
                controller.fs.is_normalized_file_path(ingest_path))
            self.assertTrue(
                controller.fs.is_seen_unprocessed_file(ingest_path))
            self.assertEqual(subdir_path,
                             GcsfsDirectoryPath.from_file_path(ingest_path))
Beispiel #7
0
    def test_move_files_from_previous_days_to_storage_incomplete_current_day(
            self):
        """Processed files from a previous day are moved to storage.

        Three files are added: an already-processed tagB file and an
        Unexpected_Tag file from the previous day, plus a tagA file from the
        current day. After the queues are drained the test expects exactly
        one file in storage (under the previous day's directory, never the
        unexpected tag) and one processed file, tagA, left in the ingest
        directory.
        """
        controller = build_gcsfs_controller_for_tests(
            StateTestGcsfsDirectIngestController,
            self.FIXTURE_PATH_PREFIX,
            run_async=False)
        if not isinstance(controller.fs, FakeDirectIngestGCSFileSystem):
            raise ValueError(f"Controller fs must have type "
                             f"FakeDirectIngestGCSFileSystem. Found instead "
                             f"type [{type(controller.fs)}]")

        previous_date = '2019-09-15'
        current_date = '2019-09-16'

        file_path_from_prev_day = path_for_fixture_file(
            controller,
            'tagB.csv',
            should_normalize=True,
            dt=datetime.datetime.fromisoformat(previous_date))

        # pylint:disable=protected-access
        processed_file_from_prev_day = \
            controller.fs._to_processed_file_path(file_path_from_prev_day)

        unexpected_file_path_from_prev_day = path_for_fixture_file(
            controller,
            'Unexpected_Tag.csv',
            should_normalize=True,
            dt=datetime.datetime.fromisoformat(previous_date))

        file_path_from_current_day = path_for_fixture_file(
            controller,
            'tagA.csv',
            should_normalize=True,
            dt=datetime.datetime.fromisoformat(current_date))

        controller.fs.test_add_path(processed_file_from_prev_day)
        controller.fs.test_add_path(unexpected_file_path_from_prev_day)
        controller.fs.test_add_path(file_path_from_current_day)

        run_task_queues_to_empty(controller)

        # Previously assertTrue(len(...), 3), which treated 3 as the failure
        # message and passed for any non-empty file system.
        self.assertEqual(3, len(controller.fs.all_paths))

        storage_paths = []
        processed_paths = []
        for path in controller.fs.all_paths:
            # A path may be both in storage and processed, so these checks
            # are intentionally independent.
            if self._path_in_storage_dir(path, controller):
                if 'Unexpected_Tag' in path.abs_path():
                    self.fail('Unexpected tag found in storage dir')
                storage_paths.append(path)
            if controller.fs.is_processed_file(path):
                processed_paths.append(path)

        self.assertEqual(len(storage_paths), 1)

        expected_storage_dir_str = os.path.join(
            controller.storage_directory_path.abs_path(), previous_date)
        self.assertTrue(
            storage_paths[0].abs_path().startswith(expected_storage_dir_str))

        # Path that is moved retains its 'processed_' prefix.
        self.assertEqual(len(processed_paths), 2)

        processed_paths_not_in_storage = \
            [path
             for path in processed_paths
             if not self._path_in_storage_dir(path, controller)]

        self.assertEqual(len(processed_paths_not_in_storage), 1)

        processed_path_str = processed_paths_not_in_storage[0].abs_path()
        self.assertTrue(
            processed_path_str.startswith(
                controller.ingest_directory_path.abs_path()))
        self.assertIn('tagA', processed_path_str)