def build_gcsfs_controller_for_tests(
    controller_cls,
    fixture_path_prefix: str,
    run_async: bool,
    **kwargs,
) -> GcsfsDirectIngestController:
    """Instantiates |controller_cls| for a test with its collaborators faked.

    While the controller is constructed, the cloud task manager, BigQuery
    client, raw file import manager, ingest view collector and
    GcsfsFactory.build are patched so that fakes are used in their place.

    Args:
        controller_cls: The controller class to instantiate.
        fixture_path_prefix: Path prefix; the controller is given
            '<fixture_path_prefix>/fixtures' as its ingest directory path.
        run_async: If True a FakeAsyncDirectIngestCloudTaskManager is used,
            otherwise a FakeSynchronousDirectIngestCloudTaskManager.
        **kwargs: Passed through unchanged to the |controller_cls| constructor.

    Returns:
        The constructed controller, after it has been registered with both the
        fake task manager and the fake file system.
    """
    fake_fs = FakeDirectIngestGCSFileSystem()

    def _return_fake_fs():
        return fake_fs

    # Controllers whose class name contains 'TestGcsfsDirectIngestController'
    # get the fake view collector; every other controller keeps the real one.
    wants_fake_collector = \
        'TestGcsfsDirectIngestController' in controller_cls.__name__
    view_collector_cls: Type[BigQueryViewCollector] = (
        FakeDirectIngestPreProcessedIngestViewCollector
        if wants_fake_collector
        else DirectIngestPreProcessedIngestViewCollector)

    base_module = BaseDirectIngestController.__module__
    gcsfs_module = GcsfsDirectIngestController.__module__

    with patch(f'{base_module}.DirectIngestCloudTaskManagerImpl') \
            as mock_task_factory_cls, \
            patch(f'{gcsfs_module}.BigQueryClientImpl') \
            as mock_big_query_client_cls, \
            patch(f'{gcsfs_module}.DirectIngestRawFileImportManager',
                  FakeDirectIngestRawFileImportManager), \
            patch(f'{gcsfs_module}.DirectIngestPreProcessedIngestViewCollector',
                  view_collector_cls):
        if run_async:
            task_manager = FakeAsyncDirectIngestCloudTaskManager()
        else:
            task_manager = FakeSynchronousDirectIngestCloudTaskManager()
        mock_task_factory_cls.return_value = task_manager
        mock_big_query_client_cls.return_value = FakeDirectIngestBigQueryClient(
            project_id=metadata.project_id(), fs=fake_fs)

        # The file system factory is only swapped out while the controller
        # itself is being built and wired to the fakes.
        with patch.object(GcsfsFactory, 'build', new=_return_fake_fs):
            controller = controller_cls(
                ingest_directory_path=f'{fixture_path_prefix}/fixtures',
                storage_directory_path='storage/path',
                **kwargs)
            task_manager.set_controller(controller)
            fake_fs.test_set_controller(controller)

    return controller
Exemple #2
0
 def create_export_manager(self, region):
     """Builds a DirectIngestIngestViewExportManager for |region|.

     The manager is wired to a new fake GCS file system, the 'ingest_bucket'
     directory, this test's mock BigQuery client, a Postgres-backed file
     metadata manager and a view collector for the 'ingest_view' file tag.
     """
     metadata_manager = PostgresDirectIngestFileMetadataManager(region.region_code)
     fake_fs = FakeDirectIngestGCSFileSystem()
     ingest_directory = GcsfsDirectoryPath.from_absolute_path('ingest_bucket')
     bq_client = self.mock_client
     collector = _ViewCollector(region, controller_file_tags=['ingest_view'])

     return DirectIngestIngestViewExportManager(
         region=region,
         fs=fake_fs,
         ingest_directory_path=ingest_directory,
         big_query_client=bq_client,
         file_metadata_manager=metadata_manager,
         view_collector=collector)
    def setUp(self) -> None:
        """Builds a raw file import manager backed by a fake file system and
        an autospec'd BigQuery client, and patches the manager module's `time`.
        """
        region_code = 'us_xx'

        self.project_id = 'recidiviz-456'
        self.test_region = fake_region(
            region_code=region_code,
            are_raw_data_bq_imports_enabled_in_env=True)
        self.fs = FakeDirectIngestGCSFileSystem()
        self.ingest_directory_path = GcsfsDirectoryPath(
            bucket_name='direct/controllers/fixtures')
        self.temp_output_path = GcsfsDirectoryPath(bucket_name='temp_bucket')

        raw_config_yaml = fixtures.as_filepath('us_xx_raw_data_files.yaml')
        self.region_raw_file_config = DirectIngestRegionRawFileConfig(
            region_code=region_code,
            yaml_config_file_path=raw_config_yaml)

        self.mock_big_query_client = create_autospec(BigQueryClient)
        self.num_lines_uploaded = 0

        # Every load-from-cloud-storage request is routed to the validating
        # fake defined on this test case.
        insert_mock = \
            self.mock_big_query_client.insert_into_table_from_cloud_storage_async
        insert_mock.side_effect = self.mock_import_raw_file_to_big_query

        manager = DirectIngestRawFileImportManager(
            region=self.test_region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_path,
            region_raw_file_config=self.region_raw_file_config,
            big_query_client=self.mock_big_query_client)
        manager.csv_reader = TestSafeGcsCsvReader(self.fs)
        self.import_manager = manager

        self.time_patcher = patch(
            'recidiviz.ingest.direct.controllers.direct_ingest_raw_file_import_manager.time'
        )
        self.mock_time = self.time_patcher.start()

        def _dataset_ref_in_test_project(
                dataset_id: str) -> bigquery.DatasetReference:
            return bigquery.DatasetReference(project=self.project_id,
                                             dataset_id=dataset_id)

        # Replaced only after the manager is built, with a plain function
        # rather than the autospec'd attribute.
        self.mock_big_query_client.dataset_ref_for_id = \
            _dataset_ref_in_test_project
class DirectIngestRawFileImportManagerTest(unittest.TestCase):
    """Tests for DirectIngestRawFileImportManager."""

    # Raw data columns the tests expect in each fixture's destination schema.
    _TAG_C_COLUMNS = ('COL1', 'COL2', 'COL3')
    _PIPE_SEPARATED_COLUMNS = ('PRIMARY_COL1', 'COL2', 'COL3', 'COL4')
    _PIPE_SEPARATED_FILENAME = 'tagPipeSeparatedNonUTF8.txt'
    _PIPE_SEPARATED_TABLE_ID = 'tagPipeSeparatedNonUTF8'

    def setUp(self) -> None:
        """Builds the import manager under test on top of a fake file system
        and an autospec'd BigQuery client, and patches the module's `time`.
        """
        region_code = 'us_xx'

        self.project_id = 'recidiviz-456'
        self.test_region = fake_region(
            region_code=region_code,
            are_raw_data_bq_imports_enabled_in_env=True)
        self.fs = FakeDirectIngestGCSFileSystem()
        self.ingest_directory_path = GcsfsDirectoryPath(
            bucket_name='direct/controllers/fixtures')
        self.temp_output_path = GcsfsDirectoryPath(bucket_name='temp_bucket')

        raw_config_yaml = fixtures.as_filepath('us_xx_raw_data_files.yaml')
        self.region_raw_file_config = DirectIngestRegionRawFileConfig(
            region_code=region_code,
            yaml_config_file_path=raw_config_yaml)

        self.mock_big_query_client = create_autospec(BigQueryClient)
        self.num_lines_uploaded = 0

        # Every load-from-cloud-storage request is routed to the validating
        # fake below instead of BigQuery.
        insert_mock = \
            self.mock_big_query_client.insert_into_table_from_cloud_storage_async
        insert_mock.side_effect = self.mock_import_raw_file_to_big_query

        manager = DirectIngestRawFileImportManager(
            region=self.test_region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_path,
            region_raw_file_config=self.region_raw_file_config,
            big_query_client=self.mock_big_query_client)
        manager.csv_reader = TestSafeGcsCsvReader(self.fs)
        self.import_manager = manager

        self.time_patcher = patch(
            'recidiviz.ingest.direct.controllers.direct_ingest_raw_file_import_manager.time'
        )
        self.mock_time = self.time_patcher.start()

        def _dataset_ref_in_test_project(
                dataset_id: str) -> bigquery.DatasetReference:
            return bigquery.DatasetReference(project=self.project_id,
                                             dataset_id=dataset_id)

        # A plain function is installed here, so these lookups are not
        # recorded in the mock client's method_calls.
        self.mock_big_query_client.dataset_ref_for_id = \
            _dataset_ref_in_test_project

    def tearDown(self) -> None:
        """Undoes the `time` patch started in setUp."""
        self.time_patcher.stop()

    def mock_import_raw_file_to_big_query(
            self, *, source_uri: str,
            destination_table_schema: List[bigquery.SchemaField], **_kwargs):
        """Stands in for the BigQuery load call and validates the temp file.

        Reads the uploaded file registered for |source_uri| in the fake file
        system, raises ValueError if any string cell has surrounding
        whitespace or any cell equals a schema column name, and adds the
        number of rows read to self.num_lines_uploaded.

        Returns:
            A MagicMock in place of the load job.
        """
        header_names = [field.name for field in destination_table_schema]
        uploaded_path = GcsfsFilePath.from_absolute_path(source_uri)
        local_path = \
            self.fs.uploaded_test_path_to_actual[uploaded_path.abs_path()]

        frame = pd.read_csv(local_path, header=None, dtype=str)
        for row in frame.values:
            for cell in row:
                if isinstance(cell, str) and cell.strip() != cell:
                    raise ValueError(
                        'Did not strip white space from raw data cell')

                if cell in header_names:
                    raise ValueError(
                        f'Wrote column row to output file: {row}')
        self.num_lines_uploaded += len(frame)

        return mock.MagicMock()

    def _metadata_for_unprocessed_file_path(
            self, path: GcsfsFilePath) -> DirectIngestFileMetadata:
        """Returns unprocessed file metadata (file_id 123) for |path|."""
        file_tag = filename_parts_from_path(path).file_tag
        return DirectIngestFileMetadata(
            region_code=self.test_region.region_code,
            file_tag=file_tag,
            file_id=123,
            processed_time=None)

    def _check_no_temp_files_remain(self):
        """Fails the test if any fake fs path lies under the temp directory."""
        for candidate in self.fs.all_paths:
            if not candidate.abs_path().startswith(
                    self.temp_output_path.abs_path()):
                continue
            self.fail(
                f'Expected temp path {candidate.abs_path()} to be cleaned up')

    def _fixture_path(self, filename: str,
                      file_type: GcsfsDirectIngestFileType):
        """Returns the normalized ingest directory path for a fixture file."""
        return path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename=filename,
            should_normalize=True,
            file_type=file_type)

    def _import_raw_fixture(self, filename: str) -> None:
        """Adds the raw data fixture |filename| to the fake fs and imports it."""
        file_path = self._fixture_path(
            filename, GcsfsDirectIngestFileType.RAW_DATA)
        self.fs.test_add_path(file_path)
        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

    def _assert_import_raises_value_error(self, file_path) -> None:
        """Asserts that importing |file_path| raises a ValueError."""
        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestFileMetadata))

    @staticmethod
    def _expected_schema(column_names):
        """Returns the raw columns as nullable strings plus the two required
        metadata columns, in the order the tests assert them."""
        schema = [
            bigquery.SchemaField(column_name, 'STRING', 'NULLABLE')
            for column_name in column_names
        ]
        schema.append(bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'))
        schema.append(
            bigquery.SchemaField('update_datetime', 'DATETIME', 'REQUIRED'))
        return schema

    def _expected_insert_kwargs(self, uploaded_path, table_id: str,
                                column_names):
        """Returns the keyword arguments expected on one BigQuery load call."""
        return {
            'source_uri': f'gs://{uploaded_path}',
            'destination_dataset_ref': bigquery.DatasetReference(
                self.project_id, 'us_xx_raw_data'),
            'destination_table_id': table_id,
            'destination_table_schema': self._expected_schema(column_names),
        }

    def _assert_chunked_import(self, chunk_size: int,
                               expected_num_chunks: int) -> None:
        """Imports the pipe-separated fixture with the given chunk size and
        checks the resulting uploads, load calls, sleeps and cleanup."""
        self.import_manager.upload_chunk_size = chunk_size

        self._import_raw_fixture(self._PIPE_SEPARATED_FILENAME)

        self.assertEqual(expected_num_chunks,
                         len(self.fs.uploaded_test_path_to_actual))

        expected_insert_calls = [
            call.insert_into_table_from_cloud_storage_async(
                **self._expected_insert_kwargs(
                    uploaded_path,
                    self._PIPE_SEPARATED_TABLE_ID,
                    self._PIPE_SEPARATED_COLUMNS))
            for uploaded_path in self.fs.uploaded_test_path_to_actual
        ]

        self.assertEqual(expected_insert_calls,
                         self.mock_big_query_client.method_calls)
        # One fewer sleep than load calls is expected.
        self.assertEqual(
            len(expected_insert_calls) - 1, self.mock_time.sleep.call_count)
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_get_unprocessed_raw_files_to_import(self):
        manager = self.import_manager
        self.assertEqual([], manager.get_unprocessed_raw_files_to_import())

        raw_unprocessed = self._fixture_path(
            'file_tag_first.csv', GcsfsDirectIngestFileType.RAW_DATA)
        ingest_view_unprocessed = self._fixture_path(
            'file_tag_second.csv', GcsfsDirectIngestFileType.INGEST_VIEW)

        for added_path in (raw_unprocessed, ingest_view_unprocessed):
            self.fs.test_add_path(added_path)

        # Only the raw data file is expected back, not the ingest view file.
        self.assertEqual(
            [raw_unprocessed], manager.get_unprocessed_raw_files_to_import())

    def test_import_bq_file_not_in_tags(self):
        self._assert_import_raises_value_error(
            self._fixture_path('this_path_tag_not_in_yaml.csv',
                               GcsfsDirectIngestFileType.RAW_DATA))

    def test_import_bq_file_with_ingest_view_file(self):
        self._assert_import_raises_value_error(
            self._fixture_path('file_tag_first.csv',
                               GcsfsDirectIngestFileType.INGEST_VIEW))

    def test_import_bq_file_with_unspecified_type_file(self):
        self._assert_import_raises_value_error(
            self._fixture_path('file_tag_first.csv',
                               GcsfsDirectIngestFileType.UNSPECIFIED))

    def test_import_bq_file_feature_not_released_throws(self):
        disabled_region = fake_region(
            region_code='us_xx',
            are_raw_data_bq_imports_enabled_in_env=False)
        self.import_manager = DirectIngestRawFileImportManager(
            region=disabled_region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_path,
            region_raw_file_config=self.region_raw_file_config,
            big_query_client=self.mock_big_query_client)

        self._assert_import_raises_value_error(
            self._fixture_path('file_tag_first.csv',
                               GcsfsDirectIngestFileType.RAW_DATA))

    def test_import_bq_file_with_raw_file(self):
        self._import_raw_fixture('tagC.csv')

        self.assertEqual(1, len(self.fs.uploaded_test_path_to_actual))

        uploaded_path = one(self.fs.uploaded_test_path_to_actual.keys())
        insert_mock = \
            self.mock_big_query_client.insert_into_table_from_cloud_storage_async
        insert_mock.assert_called_with(
            **self._expected_insert_kwargs(
                uploaded_path, 'tagC', self._TAG_C_COLUMNS))
        self.assertEqual(2, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_with_raw_file_alternate_separator_and_encoding(
            self):
        self._import_raw_fixture(self._PIPE_SEPARATED_FILENAME)

        self.assertEqual(1, len(self.fs.uploaded_test_path_to_actual))

        uploaded_path = one(self.fs.uploaded_test_path_to_actual.keys())
        insert_mock = \
            self.mock_big_query_client.insert_into_table_from_cloud_storage_async
        insert_mock.assert_called_with(
            **self._expected_insert_kwargs(
                uploaded_path,
                self._PIPE_SEPARATED_TABLE_ID,
                self._PIPE_SEPARATED_COLUMNS))
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_multiple_chunks_even_division(self):
        self._assert_chunked_import(chunk_size=1, expected_num_chunks=5)

    def test_import_bq_file_multiple_chunks_uneven_division(self):
        self._assert_chunked_import(chunk_size=2, expected_num_chunks=3)
Exemple #5
0
 def setUp(self) -> None:
     """Creates a fake file system and a prioritizer over it that expects
     the 'tagA' and 'tagB' file tags and applies no file type filter."""
     fake_fs = FakeDirectIngestGCSFileSystem()
     expected_file_tags = ['tagA', 'tagB']

     self.fs = fake_fs
     self.prioritizer = GcsfsDirectIngestJobPrioritizer(
         fake_fs,
         self._INGEST_BUCKET_PATH,
         expected_file_tags,
         file_type_filter=None)
Exemple #6
0
class TestGcsfsDirectIngestJobPrioritizerNoFilter(unittest.TestCase):
    """Tests for the GcsfsDirectIngestJobPrioritizer."""

    # Three upload times on the first day and one on the following day.
    _DAY_1_TIME_1 = datetime.datetime(
        2019, 1, 2, 3, 4, 5, 6789, tzinfo=datetime.timezone.utc)

    _DAY_1_TIME_2 = datetime.datetime(
        2019, 1, 2, 3, 4, 5, 7789, tzinfo=datetime.timezone.utc)

    _DAY_1_TIME_3 = datetime.datetime(
        2019, 1, 2, 10, 4, 5, 678, tzinfo=datetime.timezone.utc)

    _DAY_2_TIME_1 = datetime.datetime(
        2019, 1, 3, 3, 4, 5, 6789, tzinfo=datetime.timezone.utc)

    _DAY_1 = _DAY_1_TIME_1.date()
    _DAY_2 = _DAY_2_TIME_1.date()

    FIXTURE_PATH_PREFIX = 'direct/regions/us_nd/fixtures'

    _INGEST_BUCKET_PATH = \
        GcsfsDirectoryPath.from_absolute_path('direct/regions/us_nd/fixtures')

    def setUp(self) -> None:
        """Creates a fake file system and a prioritizer over it that expects
        the 'tagA' and 'tagB' file tags and applies no file type filter."""
        fake_fs = FakeDirectIngestGCSFileSystem()
        self.fs = fake_fs
        self.prioritizer = GcsfsDirectIngestJobPrioritizer(
            fake_fs,
            self._INGEST_BUCKET_PATH,
            ['tagA', 'tagB'],
            file_type_filter=None)

    def _normalized_path_for_filename(self,
                                      filename: str,
                                      file_type: GcsfsDirectIngestFileType,
                                      dt: datetime.datetime) -> GcsfsFilePath:
        """Returns the normalized unprocessed path for |filename| inside the
        ingest bucket, for the given file type and datetime."""
        raw_path = os.path.join(self._INGEST_BUCKET_PATH.abs_path(), filename)
        return GcsfsFilePath.from_absolute_path(
            to_normalized_unprocessed_file_path(
                original_file_path=raw_path,
                file_type=file_type,
                dt=dt))

    def _paths_for(self, *specs) -> List[GcsfsFilePath]:
        """Builds one normalized path per (filename, file_type, dt) tuple."""
        return [
            self._normalized_path_for_filename(filename, file_type, dt)
            for filename, file_type, dt in specs
        ]

    def _add_paths(self, paths: List[GcsfsFilePath]) -> None:
        """Registers every path in the fake file system, in order."""
        for path in paths:
            self.fs.test_add_path(path)

    def _assert_flag(self, actual, expected: bool) -> None:
        """Applies assertTrue or assertFalse to |actual| based on |expected|."""
        if expected:
            self.assertTrue(actual)
        else:
            self.assertFalse(actual)

    def _assert_no_next_job(self) -> None:
        """Asserts the prioritizer has no further job to hand out."""
        self.assertIsNone(self.prioritizer.get_next_job_args())

    def _assert_more_jobs_expected(self, day, expected: bool) -> None:
        """Checks whether the prioritizer still expects jobs on |day|."""
        self._assert_flag(
            self.prioritizer.are_more_jobs_expected_for_day(day.isoformat()),
            expected)

    def _run_next_job(self,
                      path: GcsfsFilePath,
                      *,
                      args_expected: bool,
                      more_jobs_expected: bool) -> None:
        """Checks that the next job is for |path|, verifies the prioritizer's
        expectations for it, then marks |path| processed in the fake fs."""
        date_str = filename_parts_from_path(path).date_str
        next_job_args = self.prioritizer.get_next_job_args()
        self.assertIsNotNone(next_job_args)
        if next_job_args is None:
            # Make mypy happy
            self.fail()
        self.assertEqual(next_job_args.file_path, path)
        self._assert_flag(
            self.prioritizer.are_next_args_expected(next_job_args),
            args_expected)
        self._assert_flag(
            self.prioritizer.are_more_jobs_expected_for_day(date_str),
            more_jobs_expected)

        # ... job runs ...

        self.fs.mv_path_to_processed_path(path)

    def _process_jobs_for_paths_with_no_gaps_in_expected_order(
            self, paths: List[GcsfsFilePath]):
        """Runs one job per path, asserting that each is the next job, is
        expected, and that more jobs are still expected for its day."""
        for path in paths:
            self._run_next_job(
                path, args_expected=True, more_jobs_expected=True)

    def test_empty_fs(self):
        self._assert_more_jobs_expected(self._DAY_1_TIME_1.date(), True)
        self._assert_no_next_job()

    def test_single_expected_file(self):
        paths = self._paths_for(
            ('tagA.csv', GcsfsDirectIngestFileType.UNSPECIFIED,
             self._DAY_1_TIME_1))
        self._add_paths(paths)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self._assert_no_next_job()

        # We still expect a file for tagB
        self._assert_more_jobs_expected(self._DAY_1, True)

    def test_multiple_files(self):
        paths = self._paths_for(
            ('tagA.csv', GcsfsDirectIngestFileType.RAW_DATA,
             self._DAY_1_TIME_1),
            ('tagB.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
             self._DAY_1_TIME_2),
        )
        self._add_paths(paths)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self._assert_no_next_job()
        self._assert_more_jobs_expected(self._DAY_1, False)

    def test_unexpected_file(self):
        # Only file is out of order
        path = self._normalized_path_for_filename(
            'tagB.csv', GcsfsDirectIngestFileType.UNSPECIFIED,
            self._DAY_1_TIME_1)
        self.fs.test_add_path(path)

        self._assert_more_jobs_expected(self._DAY_1, True)

        next_job_args = self.prioritizer.get_next_job_args()
        self.assertIsNotNone(next_job_args)
        self.assertEqual(next_job_args.file_path, path)
        self.assertFalse(self.prioritizer.are_next_args_expected(next_job_args))

        # ... job runs eventually even though unexpected...

        self.fs.mv_path_to_processed_path(path)

        self._assert_no_next_job()

        # We still expect a file for tagA
        self._assert_more_jobs_expected(self._DAY_1, True)

    def test_files_on_multiple_days(self):
        paths = self._paths_for(
            ('tagA.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
             self._DAY_1_TIME_1),
            ('tagB.csv', GcsfsDirectIngestFileType.RAW_DATA,
             self._DAY_1_TIME_2),
            ('tagA.csv', GcsfsDirectIngestFileType.UNSPECIFIED,
             self._DAY_2_TIME_1),
        )
        self._add_paths(paths)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self._assert_no_next_job()
        self._assert_more_jobs_expected(self._DAY_1, False)
        self._assert_more_jobs_expected(self._DAY_2, True)

    def test_files_on_multiple_days_with_gap(self):
        """Runs a test where there are files on multiple days and there is a gap
        in the expected files for the first day.
        """
        paths = self._paths_for(
            ('tagB.csv', GcsfsDirectIngestFileType.UNSPECIFIED,
             self._DAY_1_TIME_2),
            ('tagA.csv', GcsfsDirectIngestFileType.UNSPECIFIED,
             self._DAY_2_TIME_1),
        )
        self._add_paths(paths)

        # Only the first file arrives unexpectedly (tagA is missing on day 1).
        for path, args_expected in zip(paths, (False, True)):
            self._run_next_job(
                path, args_expected=args_expected, more_jobs_expected=True)

        self._assert_no_next_job()
        self._assert_more_jobs_expected(self._DAY_1, True)
        self._assert_more_jobs_expected(self._DAY_2, True)

    def test_multiple_files_same_tag(self):
        paths = self._paths_for(
            ('tagA.csv', GcsfsDirectIngestFileType.UNSPECIFIED,
             self._DAY_1_TIME_1),
            ('tagA.csv', GcsfsDirectIngestFileType.RAW_DATA,
             self._DAY_1_TIME_2),
            ('tagB.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
             self._DAY_1_TIME_3),
        )
        self._add_paths(paths)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self._assert_no_next_job()
        self._assert_more_jobs_expected(self._DAY_1, False)

    def test_multiple_files_times_out_of_order(self):
        """Runs a test where there are no gaps but the files have been added
        (i.e. have creation times) out of order.
        """
        paths = self._paths_for(
            ('tagA.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
             self._DAY_1_TIME_2),
            ('tagB.csv', GcsfsDirectIngestFileType.RAW_DATA,
             self._DAY_1_TIME_1),
            ('tagB.csv', GcsfsDirectIngestFileType.UNSPECIFIED,
             self._DAY_1_TIME_3),
        )
        self._add_paths(paths)

        # No more jobs are expected for the day once the last file is next.
        for path, more_jobs_expected in zip(paths, (True, True, False)):
            self._run_next_job(
                path,
                args_expected=True,
                more_jobs_expected=more_jobs_expected)

        self._assert_no_next_job()
        self._assert_more_jobs_expected(self._DAY_1, False)

    def test_run_multiple_copies_of_same_tag(self):
        paths = self._paths_for(
            ('tagA.csv', GcsfsDirectIngestFileType.UNSPECIFIED,
             self._DAY_1_TIME_2),
            ('tagA_2.csv', GcsfsDirectIngestFileType.RAW_DATA,
             self._DAY_1_TIME_1),
            ('tagB.csv', GcsfsDirectIngestFileType.UNSPECIFIED,
             self._DAY_1_TIME_3),
        )
        self._add_paths(paths)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self._assert_no_next_job()
        self._assert_more_jobs_expected(self._DAY_1, False)
Exemple #7
0
 def setUp(self) -> None:
     """Stores a new FakeDirectIngestGCSFileSystem on self.fs."""
     self.fs = FakeDirectIngestGCSFileSystem()
Exemple #8
0
class TestFakeDirectIngestGcsFileSystem(TestCase):
    """Tests for the DirectIngestGCSFileSystem, exercised through the
    FakeDirectIngestGCSFileSystem instance that setUp() creates."""

    # Directory that files are moved into when they go to storage.
    STORAGE_DIR_PATH = GcsfsDirectoryPath(bucket_name='storage_bucket',
                                          relative_path='region_subdir')

    # Ingest directory; the test files are created in this same 'my_bucket'
    # bucket.
    INGEST_DIR_PATH = GcsfsDirectoryPath(bucket_name='my_bucket')

    def setUp(self) -> None:
        """Creates a new FakeDirectIngestGCSFileSystem and stores it on self.fs."""
        self.fs = FakeDirectIngestGCSFileSystem()

    def fully_process_file(self,
                           dt: datetime.datetime,
                           path: GcsfsFilePath,
                           file_type_differentiation_on: bool = False) -> None:
        """Mimics all the file system calls for a single file in the direct
        ingest system, from getting added to the ingest bucket, turning to a
        processed file, then getting moved to storage.

        Along the way this asserts on the number of unprocessed / processed
        files, and at the end on how many files were added to storage and on
        where the newly stored files live.

        Args:
            dt: Timestamp used when renaming |path| to its normalized path. Its
                date is also the (inclusive) bound used when moving processed
                files to storage.
            path: The path of the file to add to the file system.
            file_type_differentiation_on: When True, the file is normalized as
                a RAW_DATA file, a copy of the processed raw file is created as
                an unprocessed INGEST_VIEW file, and both files are expected to
                end up in storage. When False, the file is normalized as
                UNSPECIFIED and a single file is expected in storage.
        """

        self.fs.test_add_path(path)

        # Snapshot the starting state so the final assertions can be expressed
        # as deltas; this lets the helper be called repeatedly in one test.
        start_num_total_files = len(self.fs.all_paths)
        # pylint: disable=protected-access
        start_ingest_paths = self.fs._ls_with_file_prefix(
            self.INGEST_DIR_PATH, '', None)
        start_storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '', None)
        if file_type_differentiation_on:
            start_raw_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH,
                '',
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA)
            start_ingest_view_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH,
                '',
                file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW)
        else:
            start_raw_storage_paths = []
            start_ingest_view_storage_paths = []

        # File is renamed to normalized path
        file_type = GcsfsDirectIngestFileType.RAW_DATA \
            if file_type_differentiation_on else GcsfsDirectIngestFileType.UNSPECIFIED

        self.fs.mv_path_to_normalized_path(path, file_type, dt)

        if file_type_differentiation_on:
            raw_unprocessed = self.fs.get_unprocessed_file_paths(
                self.INGEST_DIR_PATH,
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA)
            self.assertEqual(len(raw_unprocessed), 1)
            self.assertTrue(
                self.fs.is_seen_unprocessed_file(raw_unprocessed[0]))

            # ... raw file imported to BQ

            processed_path = self.fs.mv_path_to_processed_path(
                raw_unprocessed[0])

            processed = self.fs.get_processed_file_paths(
                self.INGEST_DIR_PATH, None)
            self.assertEqual(len(processed), 1)

            # Create the unprocessed INGEST_VIEW counterpart of the processed
            # raw file, then send the raw file to storage.
            self.fs.copy(
                processed_path,
                GcsfsFilePath.from_absolute_path(
                    to_normalized_unprocessed_file_path_from_normalized_path(
                        processed_path.abs_path(),
                        file_type_override=GcsfsDirectIngestFileType.
                        INGEST_VIEW)))
            self.fs.mv_path_to_storage(processed_path, self.STORAGE_DIR_PATH)

        # The same filter selects the file to ingest and, further down, the
        # processed files that get moved to storage.
        ingest_view_filter = GcsfsDirectIngestFileType.INGEST_VIEW \
            if file_type_differentiation_on else None

        ingest_unprocessed = self.fs.get_unprocessed_file_paths(
            self.INGEST_DIR_PATH, file_type_filter=ingest_view_filter)
        self.assertEqual(len(ingest_unprocessed), 1)
        self.assertTrue(self.fs.is_seen_unprocessed_file(
            ingest_unprocessed[0]))

        # ... file is ingested

        # File is moved to processed path
        self.fs.mv_path_to_processed_path(ingest_unprocessed[0])
        processed = self.fs.get_processed_file_paths(self.INGEST_DIR_PATH,
                                                     None)
        self.assertEqual(len(processed), 1)
        self.assertTrue(self.fs.is_processed_file(processed[0]))

        unprocessed = self.fs.get_unprocessed_file_paths(
            self.INGEST_DIR_PATH, None)
        self.assertEqual(len(unprocessed), 0)

        # File is moved to storage
        self.fs.mv_processed_paths_before_date_to_storage(
            self.INGEST_DIR_PATH,
            self.STORAGE_DIR_PATH,
            date_str_bound=dt.date().isoformat(),
            include_bound=True,
            file_type_filter=ingest_view_filter)

        end_ingest_paths = self.fs._ls_with_file_prefix(self.INGEST_DIR_PATH,
                                                        '',
                                                        file_type_filter=None)
        end_storage_paths = self.fs._ls_with_file_prefix(self.STORAGE_DIR_PATH,
                                                         '',
                                                         file_type_filter=None)
        if file_type_differentiation_on:
            end_raw_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH,
                '',
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA)
            end_ingest_view_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH,
                '',
                file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW)
        else:
            end_raw_storage_paths = []
            end_ingest_view_storage_paths = []

        # Each file gets re-exported as ingest view
        splitting_factor = 2 if file_type_differentiation_on else 1

        expected_final_total_files = start_num_total_files + splitting_factor - 1
        self.assertEqual(len(self.fs.all_paths), expected_final_total_files)
        self.assertEqual(len(end_ingest_paths), len(start_ingest_paths) - 1)
        self.assertEqual(len(end_storage_paths),
                         len(start_storage_paths) + splitting_factor)
        if file_type_differentiation_on:
            self.assertEqual(
                len(end_raw_storage_paths) +
                len(end_ingest_view_storage_paths), len(end_storage_paths))
            self.assertEqual(len(end_raw_storage_paths),
                             len(start_raw_storage_paths) + 1)
            self.assertEqual(len(end_ingest_view_storage_paths),
                             len(start_ingest_view_storage_paths) + 1)

        # Both values are loop-invariant, so compute them once up front.
        start_storage_abs_paths = {p.abs_path() for p in start_storage_paths}
        # splitext copes with file names that contain no dot or several dots,
        # where unpacking `split('.')` into two names would raise ValueError.
        name, _ = os.path.splitext(path.file_name)

        for sp in end_storage_paths:
            # Parsed for every storage path, so an unparseable name fails here.
            parts = filename_parts_from_path(sp)
            if sp.abs_path() in start_storage_abs_paths:
                # Already in storage before this call; nothing new to verify.
                continue

            self.assertTrue(sp.abs_path().startswith(
                self.STORAGE_DIR_PATH.abs_path()))
            dir_path, storage_file_name = os.path.split(sp.abs_path())
            if parts.file_type != GcsfsDirectIngestFileType.UNSPECIFIED:
                # Typed files are expected under a directory whose path
                # contains the file type value.
                self.assertIn(parts.file_type.value, dir_path)
            self.assertIn(name, storage_file_name)

    def test_direct_ingest_file_moves(self):
        """Runs one file through fully_process_file with file type
        differentiation left off."""
        file_path = GcsfsFilePath(bucket_name='my_bucket',
                                  blob_name='test_file.csv')
        self.fully_process_file(dt=datetime.datetime.now(), path=file_path)

    def test_direct_ingest_multiple_file_moves(self):
        """Runs two differently named files through fully_process_file, one
        after the other, with file type differentiation left off."""
        for blob_name in ('test_file.csv', 'test_file_2.csv'):
            self.fully_process_file(
                datetime.datetime.now(),
                GcsfsFilePath(bucket_name='my_bucket', blob_name=blob_name))

    def test_move_to_storage_with_conflict(self):
        """Processes the same file name twice with the same timestamp and
        checks that storage holds two files: one ending in 'test_file.csv' and
        one ending in 'test_file-(1).csv'."""
        upload_time = datetime.datetime.now()

        # The second pass uploads a file with a duplicate name that has
        # already been moved to storage.
        for _ in range(2):
            self.fully_process_file(
                upload_time,
                GcsfsFilePath(bucket_name='my_bucket',
                              blob_name='test_file.csv'))

        # pylint: disable=protected-access
        storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '', file_type_filter=None)
        self.assertEqual(len(storage_paths), 2)

        stored_abs_paths = []
        for storage_path in storage_paths:
            # Every stored file name must still parse into filename parts.
            self.assertTrue(filename_parts_from_path(storage_path))
            stored_abs_paths.append(storage_path.abs_path())

        self.assertTrue(
            any(p.endswith('test_file.csv') for p in stored_abs_paths))
        self.assertTrue(
            any(p.endswith('test_file-(1).csv') for p in stored_abs_paths))

    def test_direct_ingest_file_moves_with_file_types(self):
        """Runs one file through fully_process_file with file type
        differentiation turned on."""
        file_path = GcsfsFilePath(bucket_name='my_bucket',
                                  blob_name='test_file.csv')
        self.fully_process_file(dt=datetime.datetime.now(),
                                path=file_path,
                                file_type_differentiation_on=True)

    def test_direct_ingest_multiple_file_moves_with_file_types(self):
        """Runs two differently named files through fully_process_file, one
        after the other, with file type differentiation turned on."""
        for blob_name in ('test_file.csv', 'test_file_2.csv'):
            self.fully_process_file(
                datetime.datetime.now(),
                GcsfsFilePath(bucket_name='my_bucket', blob_name=blob_name),
                file_type_differentiation_on=True)

    def test_move_to_storage_with_conflict_with_file_types(self):
        """Processes the same file name twice with the same timestamp and file
        type differentiation on, then checks that storage holds four files,
        with at least one ending in 'test_file.csv' and at least one ending in
        'test_file-(1).csv'."""
        upload_time = datetime.datetime.now()

        # The second pass uploads a file with a duplicate name that has
        # already been moved to storage.
        for _ in range(2):
            self.fully_process_file(
                upload_time,
                GcsfsFilePath(bucket_name='my_bucket',
                              blob_name='test_file.csv'),
                file_type_differentiation_on=True)

        # pylint: disable=protected-access
        storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '', file_type_filter=None)
        self.assertEqual(len(storage_paths), 4)

        stored_abs_paths = [
            storage_path.abs_path() for storage_path in storage_paths
        ]
        self.assertTrue(
            any(p.endswith('test_file.csv') for p in stored_abs_paths))
        self.assertTrue(
            any(p.endswith('test_file-(1).csv') for p in stored_abs_paths))