Example #1
    def __init__(self,
                 region_name: str,
                 system_level: SystemLevel,
                 ingest_directory_path: Optional[str] = None,
                 storage_directory_path: Optional[str] = None,
                 max_delay_sec_between_files: Optional[int] = None):
        super().__init__(region_name, system_level)
        self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.max_delay_sec_between_files = max_delay_sec_between_files

        if not ingest_directory_path:
            ingest_directory_path = \
                gcsfs_direct_ingest_directory_path_for_region(region_name,
                                                              system_level)
        self.ingest_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(ingest_directory_path)

        if not storage_directory_path:
            storage_directory_path = \
                gcsfs_direct_ingest_storage_directory_path_for_region(
                    region_name, system_level)

        self.storage_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(storage_directory_path)

        self.temp_output_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(gcsfs_direct_ingest_temporary_output_directory_path())

        ingest_job_file_type_filter = \
            GcsfsDirectIngestFileType.INGEST_VIEW \
            if self.region.is_raw_vs_ingest_file_name_detection_enabled() else None
        self.file_prioritizer = \
            GcsfsDirectIngestJobPrioritizer(
                self.fs,
                self.ingest_directory_path,
                self.get_file_tag_rank_list(),
                ingest_job_file_type_filter)

        self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

        self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
            region_code=self.region.region_code)

        self.raw_file_import_manager = DirectIngestRawFileImportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_directory_path,
            big_query_client=BigQueryClientImpl())

        self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            file_metadata_manager=self.file_metadata_manager,
            big_query_client=BigQueryClientImpl(),
            view_collector=DirectIngestPreProcessedIngestViewCollector(
                self.region, self.get_file_tag_rank_list()))
Example #2
    def __init__(self, ingest_bucket_path: GcsfsBucketPath) -> None:
        """Initialize the controller."""
        self.cloud_task_manager = DirectIngestCloudTaskManagerImpl()
        self.ingest_instance = DirectIngestInstance.for_ingest_bucket(
            ingest_bucket_path)
        self.region_lock_manager = DirectIngestRegionLockManager.for_direct_ingest(
            region_code=self.region.region_code,
            schema_type=self.system_level.schema_type(),
            ingest_instance=self.ingest_instance,
        )
        self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.ingest_bucket_path = ingest_bucket_path
        self.storage_directory_path = (
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code=self.region_code(),
                system_level=self.system_level,
                ingest_instance=self.ingest_instance,
            ))

        self.temp_output_directory_path = (
            gcsfs_direct_ingest_temporary_output_directory_path())

        self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_bucket_path,
            self.get_file_tag_rank_list(),
        )

        self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

        self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
            region_code=self.region.region_code,
            ingest_database_name=self.ingest_database_key.db_name,
        )

        self.raw_file_import_manager = DirectIngestRawFileImportManager(
            region=self.region,
            fs=self.fs,
            ingest_bucket_path=self.ingest_bucket_path,
            temp_output_directory_path=self.temp_output_directory_path,
            big_query_client=BigQueryClientImpl(),
        )

        self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
            region=self.region,
            fs=self.fs,
            output_bucket_name=self.ingest_bucket_path.bucket_name,
            file_metadata_manager=self.file_metadata_manager,
            big_query_client=BigQueryClientImpl(),
            view_collector=DirectIngestPreProcessedIngestViewCollector(
                self.region, self.get_file_tag_rank_list()),
            launched_file_tags=self.get_file_tag_rank_list(),
        )

        self.ingest_instance_status_manager = DirectIngestInstanceStatusManager(
            self.region_code(), self.ingest_instance)
    def setUp(self) -> None:
        self.project_id = "recidiviz-456"
        self.project_id_patcher = patch("recidiviz.utils.metadata.project_id")
        self.project_id_patcher.start().return_value = self.project_id
        self.test_region = fake_region(
            region_code="us_xx", are_raw_data_bq_imports_enabled_in_env=True)

        self.region_module_patcher = patch.object(
            direct_ingest_raw_table_migration_collector,
            "regions",
            new=controller_fixtures,
        )
        self.region_module_patcher.start()

        self.fs = DirectIngestGCSFileSystem(FakeGCSFileSystem())
        self.ingest_directory_path = GcsfsDirectoryPath(
            bucket_name="direct/controllers/fixtures")
        self.temp_output_path = GcsfsDirectoryPath(bucket_name="temp_bucket")

        self.region_raw_file_config = DirectIngestRegionRawFileConfig(
            region_code="us_xx",
            yaml_config_file_dir=fixtures.as_filepath("us_xx"),
        )

        self.mock_big_query_client = create_autospec(BigQueryClient)
        self.num_lines_uploaded = 0

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.side_effect = (
            self.mock_import_raw_file_to_big_query)

        self.import_manager = DirectIngestRawFileImportManager(
            region=self.test_region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_path,
            region_raw_file_config=self.region_raw_file_config,
            big_query_client=self.mock_big_query_client,
        )
        self.import_manager.csv_reader = _TestSafeGcsCsvReader(
            self.fs.gcs_file_system)

        self.time_patcher = patch(
            "recidiviz.ingest.direct.controllers.direct_ingest_raw_file_import_manager.time"
        )
        self.mock_time = self.time_patcher.start()

        def fake_get_dataset_ref(dataset_id: str) -> bigquery.DatasetReference:
            return bigquery.DatasetReference(project=self.project_id,
                                             dataset_id=dataset_id)

        self.mock_big_query_client.dataset_ref_for_id = fake_get_dataset_ref
Example #4
    def __init__(self,
                 *,
                 project_id: Optional[str] = None,
                 region_code: str,
                 view_id: str,
                 view_query_template: str,
                 raw_file_config: DirectIngestRawFileConfig):
        view_dataset_id = f'{region_code.lower()}_raw_data_up_to_date_views'
        raw_table_dataset_id = DirectIngestRawFileImportManager.raw_tables_dataset_for_region(
            region_code)
        except_clause = self._except_clause_for_config(raw_file_config)
        datetime_cols_clause = self._datetime_cols_clause_for_config(
            raw_file_config)
        supplemental_order_by_clause = self._supplemental_order_by_clause_for_config(
            raw_file_config)
        super().__init__(
            project_id=project_id,
            dataset_id=view_dataset_id,
            view_id=view_id,
            view_query_template=view_query_template,
            raw_table_dataset_id=raw_table_dataset_id,
            raw_table_name=raw_file_config.file_tag,
            raw_table_primary_key_str=raw_file_config.primary_key_str,
            except_clause=except_clause,
            datetime_cols_clause=datetime_cols_clause,
            supplemental_order_by_clause=supplemental_order_by_clause)
Example #5
    def __init__(self,
                 *,
                 project_id: Optional[str] = None,
                 region_code: str,
                 view_id: str,
                 view_query_template: str,
                 raw_file_config: DirectIngestRawFileConfig):
        if not raw_file_config.primary_key_cols:
            raise ValueError(
                f'Empty primary key list in raw file config with tag [{raw_file_config.file_tag}] during '
                f'construction of DirectIngestRawDataTableBigQueryView')
        view_dataset_id = f'{region_code.lower()}_raw_data_up_to_date_views'
        raw_table_dataset_id = DirectIngestRawFileImportManager.raw_tables_dataset_for_region(
            region_code)
        except_clause = self._except_clause_for_config(raw_file_config)
        datetime_cols_clause = self._datetime_cols_clause_for_config(
            raw_file_config)
        supplemental_order_by_clause = self._supplemental_order_by_clause_for_config(
            raw_file_config)
        super().__init__(
            project_id=project_id,
            dataset_id=view_dataset_id,
            view_id=view_id,
            view_query_template=view_query_template,
            raw_table_dataset_id=raw_table_dataset_id,
            raw_table_name=raw_file_config.file_tag,
            raw_table_primary_key_str=raw_file_config.primary_key_str,
            except_clause=except_clause,
            datetime_cols_clause=datetime_cols_clause,
            supplemental_order_by_clause=supplemental_order_by_clause)
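        # Illustrative note (values assumed, not taken from real configs): a
        # region_code of "US_XX" puts the view in dataset
        # "us_xx_raw_data_up_to_date_views", and the backing raw table name is the
        # config's file_tag.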
    def test_import_bq_file_feature_not_released_throws(self) -> None:
        self.import_manager = DirectIngestRawFileImportManager(
            region=fake_region(region_code='us_xx',
                               are_raw_data_bq_imports_enabled_in_env=False),
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_path,
            region_raw_file_config=self.region_raw_file_config,
            big_query_client=self.mock_big_query_client)

        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='file_tag_first.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestFileMetadata))
    def setUp(self) -> None:
        self.project_id = 'recidiviz-456'
        self.test_region = fake_region(
            region_code='us_xx', are_raw_data_bq_imports_enabled_in_env=True)
        self.fs = FakeDirectIngestGCSFileSystem()
        self.ingest_directory_path = GcsfsDirectoryPath(
            bucket_name='direct/controllers/fixtures')
        self.temp_output_path = GcsfsDirectoryPath(bucket_name='temp_bucket')

        self.region_raw_file_config = DirectIngestRegionRawFileConfig(
            region_code='us_xx',
            yaml_config_file_path=fixtures.as_filepath(
                'us_xx_raw_data_files.yaml'),
        )

        self.mock_big_query_client = create_autospec(BigQueryClient)
        self.num_lines_uploaded = 0

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.side_effect = \
            self.mock_import_raw_file_to_big_query

        self.import_manager = DirectIngestRawFileImportManager(
            region=self.test_region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_path,
            region_raw_file_config=self.region_raw_file_config,
            big_query_client=self.mock_big_query_client)
        self.import_manager.csv_reader = TestSafeGcsCsvReader(self.fs)

        self.time_patcher = patch(
            'recidiviz.ingest.direct.controllers.direct_ingest_raw_file_import_manager.time'
        )
        self.mock_time = self.time_patcher.start()

        def fake_get_dataset_ref(dataset_id: str) -> bigquery.DatasetReference:
            return bigquery.DatasetReference(project=self.project_id,
                                             dataset_id=dataset_id)

        self.mock_big_query_client.dataset_ref_for_id = fake_get_dataset_ref
class DirectIngestRawFileImportManagerTest(unittest.TestCase):
    """Tests for DirectIngestRawFileImportManager."""
    def setUp(self) -> None:
        self.project_id = "recidiviz-456"
        self.project_id_patcher = patch("recidiviz.utils.metadata.project_id")
        self.project_id_patcher.start().return_value = self.project_id
        self.test_region = fake_region(
            region_code="us_xx", are_raw_data_bq_imports_enabled_in_env=True)

        self.region_module_patcher = patch.object(
            direct_ingest_raw_table_migration_collector,
            "regions",
            new=controller_fixtures,
        )
        self.region_module_patcher.start()

        self.fs = DirectIngestGCSFileSystem(FakeGCSFileSystem())
        self.ingest_directory_path = GcsfsDirectoryPath(
            bucket_name="direct/controllers/fixtures")
        self.temp_output_path = GcsfsDirectoryPath(bucket_name="temp_bucket")

        self.region_raw_file_config = DirectIngestRegionRawFileConfig(
            region_code="us_xx",
            yaml_config_file_dir=fixtures.as_filepath("us_xx"),
        )

        self.mock_big_query_client = create_autospec(BigQueryClient)
        self.num_lines_uploaded = 0

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.side_effect = (
            self.mock_import_raw_file_to_big_query)

        self.import_manager = DirectIngestRawFileImportManager(
            region=self.test_region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_path,
            region_raw_file_config=self.region_raw_file_config,
            big_query_client=self.mock_big_query_client,
        )
        self.import_manager.csv_reader = _TestSafeGcsCsvReader(
            self.fs.gcs_file_system)

        self.time_patcher = patch(
            "recidiviz.ingest.direct.controllers.direct_ingest_raw_file_import_manager.time"
        )
        self.mock_time = self.time_patcher.start()

        def fake_get_dataset_ref(dataset_id: str) -> bigquery.DatasetReference:
            return bigquery.DatasetReference(project=self.project_id,
                                             dataset_id=dataset_id)

        self.mock_big_query_client.dataset_ref_for_id = fake_get_dataset_ref

    def tearDown(self) -> None:
        self.time_patcher.stop()
        self.region_module_patcher.stop()
        self.project_id_patcher.stop()

    def mock_import_raw_file_to_big_query(
        self,
        *,
        source_uri: str,
        destination_table_schema: List[bigquery.SchemaField],
        **_kwargs: Any,
    ) -> mock.MagicMock:
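        # Stand-in for the async BigQuery load: read back the CSV chunk that was
        # uploaded to the temp bucket, verify every cell is whitespace-stripped and
        # that no header row leaked into the data, then tally the uploaded lines.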
        col_names = [
            schema_field.name for schema_field in destination_table_schema
        ]
        temp_path = GcsfsFilePath.from_absolute_path(source_uri)
        local_temp_path = self.fs.gcs_file_system.real_absolute_path_for_path(
            temp_path)

        df = pd.read_csv(local_temp_path, header=None, dtype=str)
        for value in df.values:
            for cell in value:
                if isinstance(cell, str):
                    stripped_cell = cell.strip()
                    if stripped_cell != cell:
                        raise ValueError(
                            "Did not strip white space from raw data cell")

                if cell in col_names:
                    raise ValueError(
                        f"Wrote column row to output file: {value}")
        self.num_lines_uploaded += len(df)

        return mock.MagicMock()

    def _metadata_for_unprocessed_file_path(
            self, path: GcsfsFilePath) -> DirectIngestFileMetadata:
        parts = filename_parts_from_path(path)
        return DirectIngestFileMetadata(
            region_code=self.test_region.region_code,
            file_tag=parts.file_tag,
            file_id=123,
            processed_time=None,
        )

    def _check_no_temp_files_remain(self) -> None:
        for path in self.fs.gcs_file_system.all_paths:
            if path.abs_path().startswith(self.temp_output_path.abs_path()):
                self.fail(
                    f"Expected temp path {path.abs_path()} to be cleaned up")

    def test_get_unprocessed_raw_files_to_import(self) -> None:
        self.assertEqual(
            [], self.import_manager.get_unprocessed_raw_files_to_import())

        raw_unprocessed = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename="file_tag_first.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        ingest_view_unprocessed = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename="file_tag_second.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.INGEST_VIEW,
        )

        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                            raw_unprocessed,
                                            has_fixture=False)
        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                            ingest_view_unprocessed,
                                            has_fixture=False)

        self.assertEqual(
            [raw_unprocessed],
            self.import_manager.get_unprocessed_raw_files_to_import())

    def test_import_bq_file_not_in_tags(self) -> None:
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename="this_path_tag_not_in_yaml.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestFileMetadata))

    def test_import_bq_file_with_ingest_view_file(self) -> None:
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename="file_tag_first.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.INGEST_VIEW,
        )

        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestFileMetadata))

    def test_import_bq_file_with_unspecified_type_file(self) -> None:
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename="file_tag_first.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.UNSPECIFIED,
        )

        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestFileMetadata))

    def test_import_bq_file_feature_not_released_throws(self) -> None:
        self.import_manager = DirectIngestRawFileImportManager(
            region=fake_region(region_code="us_xx",
                               are_raw_data_bq_imports_enabled_in_env=False),
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_path,
            region_raw_file_config=self.region_raw_file_config,
            big_query_client=self.mock_big_query_client,
        )

        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename="file_tag_first.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestFileMetadata))

    def test_import_bq_file_with_raw_file(self) -> None:
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename="tagC.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system, file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
        path = one(self.fs.gcs_file_system.uploaded_paths)

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=path.uri(),
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, "us_xx_raw_data"),
            destination_table_id="tagC",
            destination_table_schema=[
                bigquery.SchemaField("COL1", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL2", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL3", "STRING", "NULLABLE"),
                bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                bigquery.SchemaField("update_datetime", "DATETIME",
                                     "REQUIRED"),
            ],
        )
        self.assertEqual(2, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_with_raw_file_alternate_separator_and_encoding(
        self,
    ) -> None:
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename="tagPipeSeparatedNonUTF8.txt",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system, file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
        path = one(self.fs.gcs_file_system.uploaded_paths)

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=path.uri(),
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, "us_xx_raw_data"),
            destination_table_id="tagPipeSeparatedNonUTF8",
            destination_table_schema=[
                bigquery.SchemaField("PRIMARY_COL1", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL2", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL3", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL4", "STRING", "NULLABLE"),
                bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                bigquery.SchemaField("update_datetime", "DATETIME",
                                     "REQUIRED"),
            ],
        )
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_multiple_chunks_even_division(self) -> None:

        self.import_manager.upload_chunk_size = 1

        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename="tagPipeSeparatedNonUTF8.txt",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system, file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(5, len(self.fs.gcs_file_system.uploaded_paths))

        expected_insert_calls = [
            call(
                source_uri=uploaded_path.uri(),
                destination_dataset_ref=bigquery.DatasetReference(
                    self.project_id, "us_xx_raw_data"),
                destination_table_id="tagPipeSeparatedNonUTF8",
                destination_table_schema=[
                    bigquery.SchemaField("PRIMARY_COL1", "STRING", "NULLABLE"),
                    bigquery.SchemaField("COL2", "STRING", "NULLABLE"),
                    bigquery.SchemaField("COL3", "STRING", "NULLABLE"),
                    bigquery.SchemaField("COL4", "STRING", "NULLABLE"),
                    bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                    bigquery.SchemaField("update_datetime", "DATETIME",
                                         "REQUIRED"),
                ],
            ) for uploaded_path in self.fs.gcs_file_system.uploaded_paths
        ]

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_has_calls(
            expected_insert_calls, any_order=True)
        self.assertEqual(
            len(expected_insert_calls) - 1, self.mock_time.sleep.call_count)
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_multiple_chunks_uneven_division(self) -> None:

        self.import_manager.upload_chunk_size = 2

        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename="tagPipeSeparatedNonUTF8.txt",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system, file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(3, len(self.fs.gcs_file_system.uploaded_paths))

        expected_insert_calls = [
            call(
                source_uri=uploaded_path.uri(),
                destination_dataset_ref=bigquery.DatasetReference(
                    self.project_id, "us_xx_raw_data"),
                destination_table_id="tagPipeSeparatedNonUTF8",
                destination_table_schema=[
                    bigquery.SchemaField("PRIMARY_COL1", "STRING", "NULLABLE"),
                    bigquery.SchemaField("COL2", "STRING", "NULLABLE"),
                    bigquery.SchemaField("COL3", "STRING", "NULLABLE"),
                    bigquery.SchemaField("COL4", "STRING", "NULLABLE"),
                    bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                    bigquery.SchemaField("update_datetime", "DATETIME",
                                         "REQUIRED"),
                ],
            ) for uploaded_path in self.fs.gcs_file_system.uploaded_paths
        ]

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_has_calls(
            expected_insert_calls, any_order=True)
        self.assertEqual(
            len(expected_insert_calls) - 1, self.mock_time.sleep.call_count)
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_with_raw_file_invalid_column_chars(self) -> None:
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename="tagInvalidCharacters.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system, file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
        path = one(self.fs.gcs_file_system.uploaded_paths)

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=path.uri(),
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, "us_xx_raw_data"),
            destination_table_id="tagInvalidCharacters",
            destination_table_schema=[
                bigquery.SchemaField("COL_1", "STRING", "NULLABLE"),
                bigquery.SchemaField("_COL2", "STRING", "NULLABLE"),
                bigquery.SchemaField("_3COL", "STRING", "NULLABLE"),
                bigquery.SchemaField("_4_COL", "STRING", "NULLABLE"),
                bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                bigquery.SchemaField("update_datetime", "DATETIME",
                                     "REQUIRED"),
            ],
        )
        self.assertEqual(1, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_with_raw_file_normalization_conflict(self) -> None:
        with self.assertRaises(ValueError) as e:
            file_path = path_for_fixture_file_in_test_gcs_directory(
                directory=self.ingest_directory_path,
                filename="tagNormalizationConflict.csv",
                should_normalize=True,
                file_type=GcsfsDirectIngestFileType.RAW_DATA,
            )

            fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                                file_path)

            self.import_manager.import_raw_file_to_big_query(
                file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(
            str(e.exception),
            "Multiple columns with name [_4COL] after normalization.")

    def test_import_bq_file_with_migrations(self) -> None:
        file_datetime = migrations_tagC.DATE_1
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename="tagC.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
            dt=file_datetime,
        )
        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system, file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.mock_big_query_client.run_query_async.assert_has_calls([
            mock.call(
                query_str=
                f"UPDATE `recidiviz-456.us_xx_raw_data.tagC` SET COL1 = '456' WHERE COL1 = '123' AND update_datetime = '{file_datetime.isoformat()}';"
            ),
            mock.call(
                query_str=
                "DELETE FROM `recidiviz-456.us_xx_raw_data.tagC` WHERE COL1 = '789';"
            ),
        ])
class GcsfsDirectIngestController(
        BaseDirectIngestController[GcsfsIngestArgs, GcsfsFileContentsHandle]):
    """Controller for parsing and persisting a file in the GCS filesystem."""

    _MAX_STORAGE_FILE_RENAME_TRIES = 10
    _DEFAULT_MAX_PROCESS_JOB_WAIT_TIME_SEC = 300
    _INGEST_FILE_SPLIT_LINE_LIMIT = 2500

    def __init__(
        self,
        region_name: str,
        system_level: SystemLevel,
        ingest_directory_path: Optional[str] = None,
        storage_directory_path: Optional[str] = None,
        max_delay_sec_between_files: Optional[int] = None,
    ):
        super().__init__(region_name, system_level)
        self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.max_delay_sec_between_files = max_delay_sec_between_files

        if not ingest_directory_path:
            ingest_directory_path = gcsfs_direct_ingest_directory_path_for_region(
                region_name, system_level)
        self.ingest_directory_path = GcsfsDirectoryPath.from_absolute_path(
            ingest_directory_path)

        if not storage_directory_path:
            storage_directory_path = (
                gcsfs_direct_ingest_storage_directory_path_for_region(
                    region_name, system_level))

        self.storage_directory_path = GcsfsDirectoryPath.from_absolute_path(
            storage_directory_path)

        self.temp_output_directory_path = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_temporary_output_directory_path())

        ingest_job_file_type_filter = (
            GcsfsDirectIngestFileType.INGEST_VIEW
            if self.region.is_raw_vs_ingest_file_name_detection_enabled() else
            None)
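        # When raw vs. ingest file name detection is enabled, only INGEST_VIEW files
        # are eligible for ingest jobs here; RAW_DATA files go through the separate
        # BigQuery raw data import flow instead.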
        self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_directory_path,
            self.get_file_tag_rank_list(),
            ingest_job_file_type_filter,
        )

        self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

        self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
            region_code=self.region.region_code)

        self.raw_file_import_manager = DirectIngestRawFileImportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_directory_path,
            big_query_client=BigQueryClientImpl(),
        )

        self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            file_metadata_manager=self.file_metadata_manager,
            big_query_client=BigQueryClientImpl(),
            view_collector=DirectIngestPreProcessedIngestViewCollector(
                self.region, self.get_file_tag_rank_list()),
            launched_file_tags=self.get_file_tag_rank_list(),
        )

    # ================= #
    # NEW FILE HANDLING #
    # ================= #
    def handle_file(self, path: GcsfsFilePath, start_ingest: bool) -> None:
        """Called when a single new file is added to an ingest bucket (may also
        be called as a result of a rename).

        May be called from any worker/queue.
        """
        if self.fs.is_processed_file(path):
            logging.info("File [%s] is already processed, returning.",
                         path.abs_path())
            return

        if self.fs.is_normalized_file_path(path):
            parts = filename_parts_from_path(path)

            if (parts.is_file_split and parts.file_split_size
                    and parts.file_split_size <=
                    self.ingest_file_split_line_limit):
                self.kick_scheduler(just_finished_job=False)
                logging.info(
                    "File [%s] is already normalized and split split "
                    "with correct size, kicking scheduler.",
                    path.abs_path(),
                )
                return

        logging.info("Creating cloud task to schedule next job.")
        self.cloud_task_manager.create_direct_ingest_handle_new_files_task(
            region=self.region, can_start_ingest=start_ingest)

    def _register_all_new_paths_in_metadata(
            self, paths: List[GcsfsFilePath]) -> None:
        for path in paths:
            if not self.file_metadata_manager.has_file_been_discovered(path):
                self.file_metadata_manager.mark_file_as_discovered(path)

    @trace.span
    def handle_new_files(self, can_start_ingest: bool) -> None:
        """Searches the ingest directory for new/unprocessed files. Normalizes
        file names and splits files as necessary, schedules the next ingest job
        if allowed.


        Should only be called from the scheduler queue.
        """
        if not can_start_ingest and self.region.is_ingest_launched_in_env():
            raise ValueError(
                "The can_start_ingest flag should only be used for regions where ingest is not yet launched in a "
                "particular environment. If we want to be able to selectively pause ingest processing for a state, we "
                "will first have to build a config that is respected by both the /ensure_all_file_paths_normalized "
                "endpoint and any cloud functions that trigger ingest.")

        unnormalized_paths = self.fs.get_unnormalized_file_paths(
            self.ingest_directory_path)

        unnormalized_path_file_type = (
            GcsfsDirectIngestFileType.RAW_DATA
            if self.region.is_raw_vs_ingest_file_name_detection_enabled() else
            GcsfsDirectIngestFileType.UNSPECIFIED)

        for path in unnormalized_paths:
            logging.info("File [%s] is not yet seen, normalizing.",
                         path.abs_path())
            self.fs.mv_path_to_normalized_path(
                path, file_type=unnormalized_path_file_type)

        if unnormalized_paths:
            logging.info(
                "Normalized at least one path - returning, will handle "
                "normalized files separately.")
            # Normalizing file paths will cause the cloud function that calls
            # this function to be re-triggered.
            return

        if not can_start_ingest:
            logging.warning(
                "Ingest not configured to start post-file normalization - returning."
            )
            return

        check_is_region_launched_in_env(self.region)

        unprocessed_raw_paths = []

        ingest_file_type_filter = (
            GcsfsDirectIngestFileType.INGEST_VIEW
            if self.region.is_raw_vs_ingest_file_name_detection_enabled() else
            None)
        unprocessed_ingest_view_paths = self.fs.get_unprocessed_file_paths(
            self.ingest_directory_path,
            file_type_filter=ingest_file_type_filter)
        if self.region.is_raw_vs_ingest_file_name_detection_enabled():
            unprocessed_raw_paths = self.fs.get_unprocessed_file_paths(
                self.ingest_directory_path,
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA,
            )
            self._register_all_new_paths_in_metadata(unprocessed_raw_paths)

            if self.region.are_ingest_view_exports_enabled_in_env():
                self._register_all_new_paths_in_metadata(
                    unprocessed_ingest_view_paths)

        unprocessed_paths = unprocessed_raw_paths + unprocessed_ingest_view_paths
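        # Splitting is only attempted on the ingest view paths; when raw vs. ingest
        # file name detection is enabled, raw data files are never split.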
        did_split = False
        for path in unprocessed_ingest_view_paths:
            if self._split_file_if_necessary(path):
                did_split = True

        if did_split:
            if self.region.are_ingest_view_exports_enabled_in_env():
                post_split_unprocessed_ingest_view_paths = (
                    self.fs.get_unprocessed_file_paths(
                        self.ingest_directory_path,
                        file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW,
                    ))
                self._register_all_new_paths_in_metadata(
                    post_split_unprocessed_ingest_view_paths)

            logging.info(
                "Split at least one path - returning, will handle split "
                "files separately.")
            # Writing new split files to storage will cause the cloud function
            # that calls this function to be re-triggered.
            return

        if unprocessed_paths:
            self.schedule_next_ingest_job_or_wait_if_necessary(
                just_finished_job=False)

    def do_raw_data_import(self,
                           data_import_args: GcsfsRawDataBQImportArgs) -> None:
        """Process a raw incoming file by importing it to BQ, tracking it in our metadata tables, and moving it to
        storage on completion.
        """
        check_is_region_launched_in_env(self.region)
        if not self.region.are_raw_data_bq_imports_enabled_in_env():
            raise ValueError(
                f"Raw data imports not enabled for region [{self.region.region_code}]"
            )

        if not self.fs.exists(data_import_args.raw_data_file_path):
            logging.warning(
                "File path [%s] no longer exists - might have already been "
                "processed or deleted",
                data_import_args.raw_data_file_path,
            )
            self.kick_scheduler(just_finished_job=True)
            return

        file_metadata = self.file_metadata_manager.get_file_metadata(
            data_import_args.raw_data_file_path)

        if file_metadata.processed_time:
            logging.warning(
                "File [%s] is already marked as processed. Skipping file processing.",
                data_import_args.raw_data_file_path.file_name,
            )
            self.kick_scheduler(just_finished_job=True)
            return

        self.raw_file_import_manager.import_raw_file_to_big_query(
            data_import_args.raw_data_file_path, file_metadata)

        if not self.region.are_ingest_view_exports_enabled_in_env():
            # TODO(#3162) This is a stopgap measure for regions that have only partially launched. Delete once SQL
            #  pre-processing is enabled for all direct ingest regions.
            parts = filename_parts_from_path(
                data_import_args.raw_data_file_path)
            ingest_file_tags = self.get_file_tag_rank_list()

            if parts.file_tag in ingest_file_tags:
                self.fs.copy(
                    data_import_args.raw_data_file_path,
                    GcsfsFilePath.from_absolute_path(
                        to_normalized_unprocessed_file_path_from_normalized_path(
                            data_import_args.raw_data_file_path.abs_path(),
                            file_type_override=GcsfsDirectIngestFileType.
                            INGEST_VIEW,
                        )),
                )

        processed_path = self.fs.mv_path_to_processed_path(
            data_import_args.raw_data_file_path)
        self.file_metadata_manager.mark_file_as_processed(
            path=data_import_args.raw_data_file_path)

        self.fs.mv_path_to_storage(processed_path, self.storage_directory_path)
        self.kick_scheduler(just_finished_job=True)

    def do_ingest_view_export(
            self, ingest_view_export_args: GcsfsIngestViewExportArgs) -> None:
        check_is_region_launched_in_env(self.region)
        if not self.region.are_ingest_view_exports_enabled_in_env():
            raise ValueError(
                f"Ingest view exports not enabled for region [{self.region.region_code}]. Passed args: "
                f"{ingest_view_export_args}")

        did_export = self.ingest_view_export_manager.export_view_for_args(
            ingest_view_export_args)
        if (not did_export or not self.file_metadata_manager.
                get_ingest_view_metadata_pending_export()):
            logging.info("Creating cloud task to schedule next job.")
            self.cloud_task_manager.create_direct_ingest_handle_new_files_task(
                region=self.region, can_start_ingest=True)

    # ============== #
    # JOB SCHEDULING #
    # ============== #

    def _schedule_any_pre_ingest_tasks(self) -> bool:
        """Schedules any tasks related to SQL preprocessing of new files in preparation for ingest of those files into
        our Postgres database.

        Returns True if any jobs were scheduled or if there were already any pre-ingest jobs scheduled. Returns False if
        there are no remaining pre-ingest jobs to schedule and it is safe to proceed with ingest.
        """
        if self._schedule_raw_data_import_tasks():
            logging.info("Found pre-ingest raw data import tasks to schedule.")
            return True
        # TODO(#3020): We have logic to ensure that we wait 10 min for all files to upload properly before moving on to
        #  ingest. We probably actually need this to happen between raw data import and ingest view export steps - if we
        #  haven't seen all files yet and most recent raw data file came in sometime in the last 10 min, we should wait
        #  to do view exports.
        if self._schedule_ingest_view_export_tasks():
            logging.info("Found pre-ingest view export tasks to schedule.")
            return True
        return False

    def _schedule_raw_data_import_tasks(self) -> bool:
        if not self.region.are_raw_data_bq_imports_enabled_in_env():
            return False

        queue_info = self.cloud_task_manager.get_bq_import_export_queue_info(
            self.region)

        did_schedule = False
        tasks_to_schedule = [
            GcsfsRawDataBQImportArgs(path) for path in
            self.raw_file_import_manager.get_unprocessed_raw_files_to_import()
        ]
        for task_args in tasks_to_schedule:
            # If the file path has not actually been discovered by the metadata manager yet, it likely was just added
            # and a subsequent call to handle_new_files will register it and trigger another call to this function so we can
            # schedule the appropriate job.
            discovered = self.file_metadata_manager.has_file_been_discovered(
                task_args.raw_data_file_path)
            if discovered and not queue_info.has_task_already_scheduled(
                    task_args):
                self.cloud_task_manager.create_direct_ingest_raw_data_import_task(
                    self.region, task_args)
                did_schedule = True

        return queue_info.has_raw_data_import_jobs_queued() or did_schedule

    def _schedule_ingest_view_export_tasks(self) -> bool:
        """Schedules all pending ingest view export tasks for launched ingest view tags, if they have not been
        scheduled. If tasks are scheduled or are still running, returns True. Otherwise, if it's safe to proceed with
        next steps of ingest, returns False."""

        if not self.region.are_ingest_view_exports_enabled_in_env():
            return False

        queue_info = self.cloud_task_manager.get_bq_import_export_queue_info(
            self.region)
        if queue_info.has_ingest_view_export_jobs_queued():
            # Since we schedule all export jobs at once, after all raw files have been processed, we wait for all of the
            # export jobs to be done before checking if we need to schedule more.
            return True

        did_schedule = False
        tasks_to_schedule = (
            self.ingest_view_export_manager.get_ingest_view_export_task_args())

        rank_list = self.get_file_tag_rank_list()
        ingest_view_name_rank = {
            ingest_view_name: i
            for i, ingest_view_name in enumerate(rank_list)
        }
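        # For example, rank_list == ["tagA", "tagB"] yields {"tagA": 0, "tagB": 1},
        # so tasks below sort first by the controller's tag order and then by export
        # datetime.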

        # Filter out views that aren't in ingest view tags.
        filtered_tasks_to_schedule = []
        for args in tasks_to_schedule:
            if args.ingest_view_name not in ingest_view_name_rank:
                logging.warning(
                    "Skipping ingest view task export for [%s] - not in controller ingest tags.",
                    args.ingest_view_name,
                )
                continue
            filtered_tasks_to_schedule.append(args)

        tasks_to_schedule = filtered_tasks_to_schedule

        # Sort by tag order and export datetime
        tasks_to_schedule.sort(key=lambda args: (
            ingest_view_name_rank[args.ingest_view_name],
            args.upper_bound_datetime_to_export,
        ))

        for task_args in tasks_to_schedule:
            if not queue_info.has_task_already_scheduled(task_args):
                self.cloud_task_manager.create_direct_ingest_ingest_view_export_task(
                    self.region, task_args)
                did_schedule = True

        return did_schedule

    @classmethod
    @abc.abstractmethod
    def get_file_tag_rank_list(cls) -> List[str]:
        pass

    def _get_next_job_args(self) -> Optional[GcsfsIngestArgs]:
        args = self.file_prioritizer.get_next_job_args()

        if not self.region.are_ingest_view_exports_enabled_in_env():
            return args

        if not args:
            return None

        discovered = self.file_metadata_manager.has_file_been_discovered(
            args.file_path)

        if not discovered:
            # If the file path has not actually been discovered by the controller yet, it likely was just added and a
            # subsequent call to handle_new_files will register it and trigger another call to this function so we can
            # schedule the appropriate job.
            logging.info(
                "Found args [%s] for a file that has not been discovered by the metadata manager yet - not scheduling.",
                args,
            )
            return None

        return args

    def _wait_time_sec_for_next_args(self, args: GcsfsIngestArgs) -> int:
        if self.file_prioritizer.are_next_args_expected(args):
            # Run job immediately
            return 0

        now = datetime.datetime.utcnow()
        file_upload_time: datetime.datetime = filename_parts_from_path(
            args.file_path).utc_upload_datetime

        max_delay_sec = (self.max_delay_sec_between_files
                         if self.max_delay_sec_between_files is not None else
                         self._DEFAULT_MAX_PROCESS_JOB_WAIT_TIME_SEC)
        max_wait_from_file_upload_time = file_upload_time + datetime.timedelta(
            seconds=max_delay_sec)
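        # Illustrative timing (values assumed): with the default 300-second max
        # delay, a file uploaded 120 seconds ago yields a wait of roughly 180
        # seconds; once 300 seconds have passed since upload, wait_time drops to 0
        # and the job runs immediately.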

        if max_wait_from_file_upload_time <= now:
            wait_time = 0
        else:
            wait_time = (max_wait_from_file_upload_time - now).seconds

        logging.info("Waiting [%s] sec for [%s]", wait_time,
                     self._job_tag(args))
        return wait_time

    def _on_job_scheduled(self, ingest_args: GcsfsIngestArgs) -> None:
        pass

    # =================== #
    # SINGLE JOB RUN CODE #
    # =================== #

    def _job_tag(self, args: GcsfsIngestArgs) -> str:
        return (f"{self.region.region_code}/{args.file_path.file_name}:"
                f"{args.ingest_time}")

    def _get_contents_handle(
            self, args: GcsfsIngestArgs) -> Optional[GcsfsFileContentsHandle]:
        return self._get_contents_handle_from_path(args.file_path)

    def _get_contents_handle_from_path(
            self, path: GcsfsFilePath) -> Optional[GcsfsFileContentsHandle]:
        return self.fs.download_to_temp_file(path)

    @abc.abstractmethod
    def _are_contents_empty(self, args: GcsfsIngestArgs,
                            contents_handle: GcsfsFileContentsHandle) -> bool:
        pass

    def _can_proceed_with_ingest_for_contents(
            self, args: GcsfsIngestArgs,
            contents_handle: GcsfsFileContentsHandle) -> bool:
        parts = filename_parts_from_path(args.file_path)
        return self._are_contents_empty(
            args, contents_handle) or not self._must_split_contents(
                parts.file_type, args.file_path)

    def _must_split_contents(self, file_type: GcsfsDirectIngestFileType,
                             path: GcsfsFilePath) -> bool:
        if (self.region.is_raw_vs_ingest_file_name_detection_enabled()
                and file_type == GcsfsDirectIngestFileType.RAW_DATA):
            return False

        return not self._file_meets_file_line_limit(
            self.ingest_file_split_line_limit, path)

    @abc.abstractmethod
    def _file_meets_file_line_limit(self, line_limit: int,
                                    path: GcsfsFilePath) -> bool:
        """Subclasses should implement to determine whether the file meets the
        expected line limit"""

    @abc.abstractmethod
    def _parse(self, args: GcsfsIngestArgs,
               contents_handle: GcsfsFileContentsHandle) -> IngestInfo:
        pass

    def _should_split_file(self, path: GcsfsFilePath) -> bool:
        """Returns a handle to the contents of this path if this file should be split, None otherwise."""
        parts = filename_parts_from_path(path)

        if (self.region.is_raw_vs_ingest_file_name_detection_enabled()
                and parts.file_type != GcsfsDirectIngestFileType.INGEST_VIEW):
            raise ValueError(
                f"Should not be attempting to split files other than ingest view files, found path with "
                f"file type: {parts.file_type}")

        if parts.file_tag not in self.get_file_tag_rank_list():
            logging.info(
                "File tag [%s] for path [%s] not in rank list - not splitting.",
                parts.file_tag,
                path.abs_path(),
            )
            return False

        if (parts.is_file_split and parts.file_split_size and
                parts.file_split_size <= self.ingest_file_split_line_limit):
            logging.info(
                "File [%s] already split with size [%s].",
                path.abs_path(),
                parts.file_split_size,
            )
            return False

        return self._must_split_contents(parts.file_type, path)

    @trace.span
    def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
        """Checks if the given file needs to be split according to this controller's |file_split_line_limit|.

        Returns True if the file was split, False if splitting was not necessary.
        """

        should_split = self._should_split_file(path)
        if not should_split:
            logging.info("No need to split file path [%s].", path.abs_path())
            return False

        logging.info("Proceeding to file splitting for path [%s].",
                     path.abs_path())

        original_metadata = None
        if self.region.are_ingest_view_exports_enabled_in_env():
            original_metadata = self.file_metadata_manager.get_file_metadata(
                path)

        output_dir = GcsfsDirectoryPath.from_file_path(path)

        split_contents_paths = self._split_file(path)
        upload_paths = []
        for i, split_contents_path in enumerate(split_contents_paths):
            upload_path = self._create_split_file_path(path,
                                                       output_dir,
                                                       split_num=i)

            logging.info(
                "Copying split [%s] to direct ingest directory at path [%s].",
                i,
                upload_path.abs_path(),
            )

            upload_paths.append(upload_path)
            try:
                self.fs.mv(split_contents_path, upload_path)
            except Exception as e:
                logging.error(
                    "Threw error while copying split files from temp bucket - attempting to clean up before rethrowing."
                    " [%s]",
                    e,
                )
                for p in upload_paths:
                    self.fs.delete(p)
                raise e

        # We wait to register files with metadata manager until all files have been successfully copied to avoid leaving
        # the metadata manager in an inconsistent state.
        if self.region.are_ingest_view_exports_enabled_in_env():
            if not isinstance(original_metadata,
                              DirectIngestIngestFileMetadata):
                raise ValueError(
                    "Attempting to split a non-ingest view type file")

            logging.info(
                "Registering [%s] split files with the metadata manager.",
                len(upload_paths),
            )

            for upload_path in upload_paths:
                ingest_file_metadata = (
                    self.file_metadata_manager.register_ingest_file_split(
                        original_metadata, upload_path))
                self.file_metadata_manager.mark_ingest_view_exported(
                    ingest_file_metadata)

            self.file_metadata_manager.mark_file_as_processed(path)

        logging.info(
            "Done splitting file [%s] into [%s] paths, moving it to storage.",
            path.abs_path(),
            len(split_contents_paths),
        )

        self.fs.mv_path_to_storage(path, self.storage_directory_path)

        return True

    def _create_split_file_path(
        self,
        original_file_path: GcsfsFilePath,
        output_dir: GcsfsDirectoryPath,
        split_num: int,
    ) -> GcsfsFilePath:
        parts = filename_parts_from_path(original_file_path)

        rank_str = str(split_num + 1).zfill(5)
        updated_file_name = (
            f"{parts.stripped_file_name}_{rank_str}"
            f"_{SPLIT_FILE_SUFFIX}_size{self.ingest_file_split_line_limit}"
            f".{parts.extension}")

        file_type = (
            GcsfsDirectIngestFileType.INGEST_VIEW
            if self.region.is_raw_vs_ingest_file_name_detection_enabled() else
            GcsfsDirectIngestFileType.UNSPECIFIED)

        return GcsfsFilePath.from_directory_and_file_name(
            output_dir,
            to_normalized_unprocessed_file_path(updated_file_name,
                                                file_type=file_type,
                                                dt=parts.utc_upload_datetime),
        )

    @abc.abstractmethod
    def _split_file(self, path: GcsfsFilePath) -> List[GcsfsFilePath]:
        """Should be implemented by subclasses to split a file accessible via the provided path into multiple
        files and upload those files to GCS. Returns the list of upload paths."""

    def _do_cleanup(self, args: GcsfsIngestArgs) -> None:
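        """Marks the just-ingested file as processed in the filesystem (and in the metadata store when ingest view
        exports are enabled), then moves already-processed files for completed days to storage."""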
        self.fs.mv_path_to_processed_path(args.file_path)

        if self.region.are_ingest_view_exports_enabled_in_env():
            self.file_metadata_manager.mark_file_as_processed(args.file_path)

        parts = filename_parts_from_path(args.file_path)
        self._move_processed_files_to_storage_as_necessary(
            last_processed_date_str=parts.date_str)

    def _is_last_job_for_day(self, args: GcsfsIngestArgs) -> bool:
        """Returns True if the file handled in |args| is the last file for that
        upload date."""
        parts = filename_parts_from_path(args.file_path)
        upload_date, date_str = parts.utc_upload_datetime, parts.date_str
        more_jobs_expected = self.file_prioritizer.are_more_jobs_expected_for_day(
            date_str)
        if more_jobs_expected:
            return False
        next_job_args = self.file_prioritizer.get_next_job_args(date_str)
        if next_job_args:
            next_job_date = filename_parts_from_path(
                next_job_args.file_path).utc_upload_datetime
            return next_job_date > upload_date
        return True

    def _move_processed_files_to_storage_as_necessary(
            self, last_processed_date_str: str) -> None:
        """Moves files that have already been ingested/processed, up to and including the given date, into storage,
        if there is nothing more left to ingest/process, i.e. we are not expecting more files."""
        next_args = self.file_prioritizer.get_next_job_args()

        should_move_last_processed_date = False
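        # With no jobs queued at all, the last processed date's files can be moved as long as we are not still
        # expecting more files for that date.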
        if not next_args:
            are_more_jobs_expected = (
                self.file_prioritizer.are_more_jobs_expected_for_day(
                    last_processed_date_str))
            if not are_more_jobs_expected:
                should_move_last_processed_date = True
        else:
            next_date_str = filename_parts_from_path(
                next_args.file_path).date_str
            if next_date_str < last_processed_date_str:
                logging.info("Found a file [%s] from a date previous to our "
                             "last processed date - not moving anything to "
                             "storage.")
                return

            # If there are still more to process on this day, do not move files
            # from this day.
            should_move_last_processed_date = next_date_str != last_processed_date_str

        # Note: at this point, we expect RAW file type files to already have been moved once they were imported to BQ.
        file_type_to_move = (
            GcsfsDirectIngestFileType.INGEST_VIEW
            if self.region.is_raw_vs_ingest_file_name_detection_enabled() else
            None)

        self.fs.mv_processed_paths_before_date_to_storage(
            self.ingest_directory_path,
            self.storage_directory_path,
            file_type_filter=file_type_to_move,
            date_str_bound=last_processed_date_str,
            include_bound=should_move_last_processed_date,
        )

    @staticmethod
    def file_tag(file_path: GcsfsFilePath) -> str:
        return filename_parts_from_path(file_path).file_tag


class DirectIngestRawFileImportManagerTest(unittest.TestCase):
    """Tests for DirectIngestRawFileImportManager."""
    def setUp(self) -> None:
        self.project_id = 'recidiviz-456'
        self.project_id_patcher = patch('recidiviz.utils.metadata.project_id')
        self.project_id_patcher.start().return_value = self.project_id
        self.test_region = fake_region(
            region_code='us_xx', are_raw_data_bq_imports_enabled_in_env=True)
        self.fs = DirectIngestGCSFileSystem(FakeGCSFileSystem())
        self.ingest_directory_path = GcsfsDirectoryPath(
            bucket_name='direct/controllers/fixtures')
        self.temp_output_path = GcsfsDirectoryPath(bucket_name='temp_bucket')

        self.region_raw_file_config = DirectIngestRegionRawFileConfig(
            region_code='us_xx',
            yaml_config_file_path=fixtures.as_filepath(
                'us_xx_raw_data_files.yaml'),
        )

        self.mock_big_query_client = create_autospec(BigQueryClient)
        self.num_lines_uploaded = 0

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.side_effect = \
            self.mock_import_raw_file_to_big_query

        self.import_manager = DirectIngestRawFileImportManager(
            region=self.test_region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_path,
            region_raw_file_config=self.region_raw_file_config,
            big_query_client=self.mock_big_query_client)
        self.import_manager.csv_reader = TestSafeGcsCsvReader(
            self.fs.gcs_file_system)

        self.time_patcher = patch(
            'recidiviz.ingest.direct.controllers.direct_ingest_raw_file_import_manager.time'
        )
        self.mock_time = self.time_patcher.start()

        def fake_get_dataset_ref(dataset_id: str) -> bigquery.DatasetReference:
            return bigquery.DatasetReference(project=self.project_id,
                                             dataset_id=dataset_id)

        self.mock_big_query_client.dataset_ref_for_id = fake_get_dataset_ref

    def tearDown(self) -> None:
        self.time_patcher.stop()
        self.project_id_patcher.stop()

    def mock_import_raw_file_to_big_query(
            self, *, source_uri: str,
            destination_table_schema: List[bigquery.SchemaField], **_kwargs):
        col_names = [
            schema_field.name for schema_field in destination_table_schema
        ]
        temp_path = GcsfsFilePath.from_absolute_path(source_uri)
        local_temp_path = self.fs.gcs_file_system.real_absolute_path_for_path(
            temp_path)

        df = pd.read_csv(local_temp_path, header=None, dtype=str)
        for value in df.values:
            for cell in value:
                if isinstance(cell, str):
                    stripped_cell = cell.strip()
                    if stripped_cell != cell:
                        raise ValueError(
                            'Did not strip white space from raw data cell')

                if cell in col_names:
                    raise ValueError(
                        f'Wrote column row to output file: {value}')
        self.num_lines_uploaded += len(df)

        return mock.MagicMock()

    def _metadata_for_unprocessed_file_path(
            self, path: GcsfsFilePath) -> DirectIngestFileMetadata:
        parts = filename_parts_from_path(path)
        return DirectIngestFileMetadata(
            region_code=self.test_region.region_code,
            file_tag=parts.file_tag,
            file_id=123,
            processed_time=None)

    def _check_no_temp_files_remain(self):
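        """Fails the test if any file is still present under the temp output path after an import."""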
        for path in self.fs.gcs_file_system.all_paths:
            if path.abs_path().startswith(self.temp_output_path.abs_path()):
                self.fail(
                    f'Expected temp path {path.abs_path()} to be cleaned up')

    def test_get_unprocessed_raw_files_to_import(self):
        self.assertEqual(
            [], self.import_manager.get_unprocessed_raw_files_to_import())

        raw_unprocessed = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='file_tag_first.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        ingest_view_unprocessed = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='file_tag_second.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.INGEST_VIEW)

        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                            raw_unprocessed,
                                            has_fixture=False)
        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                            ingest_view_unprocessed,
                                            has_fixture=False)

        self.assertEqual(
            [raw_unprocessed],
            self.import_manager.get_unprocessed_raw_files_to_import())

    def test_import_bq_file_not_in_tags(self):
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='this_path_tag_not_in_yaml.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestFileMetadata))

    def test_import_bq_file_with_ingest_view_file(self):
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='file_tag_first.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.INGEST_VIEW)

        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestFileMetadata))

    def test_import_bq_file_with_unspecified_type_file(self):
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='file_tag_first.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.UNSPECIFIED)

        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestFileMetadata))

    def test_import_bq_file_feature_not_released_throws(self):
        self.import_manager = DirectIngestRawFileImportManager(
            region=fake_region(region_code='us_xx',
                               are_raw_data_bq_imports_enabled_in_env=False),
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_path,
            region_raw_file_config=self.region_raw_file_config,
            big_query_client=self.mock_big_query_client)

        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='file_tag_first.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestFileMetadata))

    def test_import_bq_file_with_raw_file(self):
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='tagC.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system, file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
        path = one(self.fs.gcs_file_system.uploaded_paths)

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=path.uri(),
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, 'us_xx_raw_data'),
            destination_table_id='tagC',
            destination_table_schema=[
                bigquery.SchemaField('COL1', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('COL2', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('COL3', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'),
                bigquery.SchemaField('update_datetime', 'DATETIME', 'REQUIRED')
            ])
        self.assertEqual(2, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_with_raw_file_alternate_separator_and_encoding(
            self):
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='tagPipeSeparatedNonUTF8.txt',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system, file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
        path = one(self.fs.gcs_file_system.uploaded_paths)

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=path.uri(),
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, 'us_xx_raw_data'),
            destination_table_id='tagPipeSeparatedNonUTF8',
            destination_table_schema=[
                bigquery.SchemaField('PRIMARY_COL1', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('COL2', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('COL3', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('COL4', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'),
                bigquery.SchemaField('update_datetime', 'DATETIME', 'REQUIRED')
            ])
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_multiple_chunks_even_division(self):

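        # A chunk size of 1 should force the 5-row fixture to be uploaded as 5 separate single-row files.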
        self.import_manager.upload_chunk_size = 1

        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='tagPipeSeparatedNonUTF8.txt',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system, file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(5, len(self.fs.gcs_file_system.uploaded_paths))

        expected_insert_calls = [
            call(source_uri=uploaded_path.uri(),
                 destination_dataset_ref=bigquery.DatasetReference(
                     self.project_id, 'us_xx_raw_data'),
                 destination_table_id='tagPipeSeparatedNonUTF8',
                 destination_table_schema=[
                     bigquery.SchemaField('PRIMARY_COL1', 'STRING',
                                          'NULLABLE'),
                     bigquery.SchemaField('COL2', 'STRING', 'NULLABLE'),
                     bigquery.SchemaField('COL3', 'STRING', 'NULLABLE'),
                     bigquery.SchemaField('COL4', 'STRING', 'NULLABLE'),
                     bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'),
                     bigquery.SchemaField('update_datetime', 'DATETIME',
                                          'REQUIRED')
                 ]) for uploaded_path in self.fs.gcs_file_system.uploaded_paths
        ]

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_has_calls(
            expected_insert_calls, any_order=True)
        self.assertEqual(
            len(expected_insert_calls) - 1, self.mock_time.sleep.call_count)
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_multiple_chunks_uneven_division(self):

        self.import_manager.upload_chunk_size = 2

        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='tagPipeSeparatedNonUTF8.txt',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system, file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(3, len(self.fs.gcs_file_system.uploaded_paths))

        expected_insert_calls = [
            call(source_uri=uploaded_path.uri(),
                 destination_dataset_ref=bigquery.DatasetReference(
                     self.project_id, 'us_xx_raw_data'),
                 destination_table_id='tagPipeSeparatedNonUTF8',
                 destination_table_schema=[
                     bigquery.SchemaField('PRIMARY_COL1', 'STRING',
                                          'NULLABLE'),
                     bigquery.SchemaField('COL2', 'STRING', 'NULLABLE'),
                     bigquery.SchemaField('COL3', 'STRING', 'NULLABLE'),
                     bigquery.SchemaField('COL4', 'STRING', 'NULLABLE'),
                     bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'),
                     bigquery.SchemaField('update_datetime', 'DATETIME',
                                          'REQUIRED')
                 ]) for uploaded_path in self.fs.gcs_file_system.uploaded_paths
        ]

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_has_calls(
            expected_insert_calls, any_order=True)
        self.assertEqual(
            len(expected_insert_calls) - 1, self.mock_time.sleep.call_count)
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_with_raw_file_invalid_column_chars(self):
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='tagInvalidCharacters.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system, file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
        path = one(self.fs.gcs_file_system.uploaded_paths)

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=path.uri(),
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, 'us_xx_raw_data'),
            destination_table_id='tagInvalidCharacters',
            destination_table_schema=[
                bigquery.SchemaField('COL_1', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('_COL2', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('_3COL', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('_4_COL', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'),
                bigquery.SchemaField('update_datetime', 'DATETIME', 'REQUIRED')
            ])
        self.assertEqual(1, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_with_raw_file_normalization_conflict(self):
        with self.assertRaises(ValueError) as e:
            file_path = path_for_fixture_file_in_test_gcs_directory(
                directory=self.ingest_directory_path,
                filename='tagNormalizationConflict.csv',
                should_normalize=True,
                file_type=GcsfsDirectIngestFileType.RAW_DATA)

            fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                                file_path)

            self.import_manager.import_raw_file_to_big_query(
                file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(
            str(e.exception),
            "Multiple columns with name [_4COL] after normalization.")


class DirectIngestRawFileImportManagerTest(unittest.TestCase):
    """Tests for DirectIngestRawFileImportManager."""
    def setUp(self) -> None:
        self.project_id = "recidiviz-456"
        self.project_id_patcher = patch("recidiviz.utils.metadata.project_id")
        self.project_id_patcher.start().return_value = self.project_id
        self.test_region = fake_region(region_code="us_xx",
                                       region_module=fake_regions_module)

        self.fs = DirectIngestGCSFileSystem(FakeGCSFileSystem())
        self.ingest_bucket_path = GcsfsBucketPath(
            bucket_name="my_ingest_bucket")
        self.temp_output_path = GcsfsDirectoryPath(bucket_name="temp_bucket")

        self.region_raw_file_config = DirectIngestRegionRawFileConfig(
            region_code="us_xx", region_module=fake_regions_module)

        self.mock_big_query_client = create_autospec(BigQueryClient)
        self.num_lines_uploaded = 0

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.side_effect = (
            self.mock_import_raw_file_to_big_query)

        self.import_manager = DirectIngestRawFileImportManager(
            region=self.test_region,
            fs=self.fs,
            ingest_bucket_path=self.ingest_bucket_path,
            temp_output_directory_path=self.temp_output_path,
            region_raw_file_config=self.region_raw_file_config,
            big_query_client=self.mock_big_query_client,
        )
        self.import_manager.csv_reader = GcsfsCsvReader(
            self.fs.gcs_file_system)

        self.time_patcher = patch(
            "recidiviz.ingest.direct.controllers.direct_ingest_raw_file_import_manager.time"
        )
        self.mock_time = self.time_patcher.start()

        def fake_get_dataset_ref(dataset_id: str) -> bigquery.DatasetReference:
            return bigquery.DatasetReference(project=self.project_id,
                                             dataset_id=dataset_id)

        self.mock_big_query_client.dataset_ref_for_id = fake_get_dataset_ref

    def tearDown(self) -> None:
        self.time_patcher.stop()
        self.project_id_patcher.stop()

    def mock_import_raw_file_to_big_query(
        self,
        *,
        source_uri: str,
        destination_table_schema: List[bigquery.SchemaField],
        **_kwargs: Any,
    ) -> mock.MagicMock:
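        """Stand-in for the asynchronous BigQuery load call: reads the uploaded chunk back from the fake GCS
        filesystem, fails if any cell still has surrounding whitespace or if a header row was written out, and
        tallies the number of data rows uploaded."""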
        col_names = [
            schema_field.name for schema_field in destination_table_schema
        ]
        temp_path = GcsfsFilePath.from_absolute_path(source_uri)
        local_temp_path = self.fs.gcs_file_system.real_absolute_path_for_path(
            temp_path)

        df = pd.read_csv(local_temp_path, header=None, dtype=str)
        for value in df.values:
            for cell in value:
                if isinstance(cell, str):
                    stripped_cell = cell.strip()
                    if stripped_cell != cell:
                        raise ValueError(
                            "Did not strip white space from raw data cell")

                if cell in col_names:
                    raise ValueError(
                        f"Wrote column row to output file: {value}")
        self.num_lines_uploaded += len(df)

        return mock.MagicMock()

    def _metadata_for_unprocessed_file_path(
            self, path: GcsfsFilePath) -> DirectIngestRawFileMetadata:
        parts = filename_parts_from_path(path)
        return DirectIngestRawFileMetadata(
            region_code=self.test_region.region_code,
            file_tag=parts.file_tag,
            file_id=123,
            processed_time=None,
            normalized_file_name=path.file_name,
            discovery_time=datetime.datetime.now(),
            datetimes_contained_upper_bound_inclusive=parts.utc_upload_datetime,
        )

    def _check_no_temp_files_remain(self) -> None:
        for path in self.fs.gcs_file_system.all_paths:
            if path.abs_path().startswith(self.temp_output_path.abs_path()):
                self.fail(
                    f"Expected temp path {path.abs_path()} to be cleaned up")

    def test_get_unprocessed_raw_files_to_import(self) -> None:
        self.assertEqual(
            [], self.import_manager.get_unprocessed_raw_files_to_import())

        raw_unprocessed = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="file_tag_first.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        ingest_view_unprocessed = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="file_tag_second.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.INGEST_VIEW,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            raw_unprocessed,
            region_code=self.test_region.region_code,
            has_fixture=False,
        )
        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            ingest_view_unprocessed,
            region_code=self.test_region.region_code,
            has_fixture=False,
        )

        self.assertEqual(
            [raw_unprocessed],
            self.import_manager.get_unprocessed_raw_files_to_import())

    def test_import_bq_file_not_in_tags(self) -> None:
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="this_path_tag_not_in_yaml.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        with self.assertRaises(ValueError) as e:
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestRawFileMetadata))
        self.assertEqual(
            str(e.exception),
            "Attempting to import raw file with tag [this_path_tag_not_in_yaml] "
            "unspecified by [us_xx] config.",
        )

    def test_import_wrong_separator_cols_do_not_parse(self) -> None:
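        # Swap in a separator that the tagC fixture does not use so that every row parses as a single column,
        # which the import below should reject.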
        file_config = self.import_manager.region_raw_file_config.raw_file_configs[
            "tagC"]
        updated_file_config = attr.evolve(file_config, separator="#")
        self.import_manager.region_raw_file_config.raw_file_configs[
            "tagC"] = updated_file_config

        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagC.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        with self.assertRaises(ValueError) as e:
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestRawFileMetadata))
        self.assertTrue(
            str(e.exception).startswith(
                "Found only one column: [COL1__COL2_COL3]. "
                "Columns likely did not parse properly."))

    def test_import_bq_file_with_ingest_view_file(self) -> None:
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="file_tag_first.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.INGEST_VIEW,
        )

        with self.assertRaises(ValueError) as e:
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestRawFileMetadata))
        self.assertEqual(
            str(e.exception),
            "Unexpected file type [GcsfsDirectIngestFileType.INGEST_VIEW] for "
            "path [file_tag_first].",
        )

    def test_import_bq_file_with_raw_file(self) -> None:
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagC.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
        path = one(self.fs.gcs_file_system.uploaded_paths)

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=path.uri(),
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, "us_xx_raw_data"),
            destination_table_id="tagC",
            destination_table_schema=[
                bigquery.SchemaField("COL1", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL2", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL3", "STRING", "NULLABLE"),
                bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                bigquery.SchemaField("update_datetime", "DATETIME",
                                     "REQUIRED"),
            ],
        )
        self.assertEqual(2, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_with_row_extra_columns(self) -> None:
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagRowExtraColumns.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        with self.assertRaisesRegex(ParserError,
                                    "Expected 4 fields in line 3, saw 5"):
            self.import_manager.import_raw_file_to_big_query(
                file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(0, len(self.fs.gcs_file_system.uploaded_paths))
        self._check_no_temp_files_remain()

    # TODO(#7318): This should fail because a row is missing values
    def test_import_bq_file_with_row_missing_columns(self) -> None:
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagRowMissingColumns.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
        path = one(self.fs.gcs_file_system.uploaded_paths)

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=path.uri(),
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, "us_xx_raw_data"),
            destination_table_id="tagRowMissingColumns",
            destination_table_schema=[
                bigquery.SchemaField("id_column", "STRING", "NULLABLE"),
                bigquery.SchemaField("comment", "STRING", "NULLABLE"),
                bigquery.SchemaField("termination_code", "STRING", "NULLABLE"),
                bigquery.SchemaField("update_date", "STRING", "NULLABLE"),
                bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                bigquery.SchemaField("update_datetime", "DATETIME",
                                     "REQUIRED"),
            ],
        )
        self.assertEqual(2, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_with_raw_file_alternate_separator_and_encoding(
            self) -> None:
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagPipeSeparatedNonUTF8.txt",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
        path = one(self.fs.gcs_file_system.uploaded_paths)

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=path.uri(),
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, "us_xx_raw_data"),
            destination_table_id="tagPipeSeparatedNonUTF8",
            destination_table_schema=[
                bigquery.SchemaField("PRIMARY_COL1", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL2", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL3", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL4", "STRING", "NULLABLE"),
                bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                bigquery.SchemaField("update_datetime", "DATETIME",
                                     "REQUIRED"),
            ],
        )
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_with_raw_file_line_terminator(self) -> None:
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagCustomLineTerminatorNonUTF8.txt",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
        path = one(self.fs.gcs_file_system.uploaded_paths)

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=path.uri(),
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, "us_xx_raw_data"),
            destination_table_id="tagCustomLineTerminatorNonUTF8",
            destination_table_schema=[
                bigquery.SchemaField("PRIMARY_COL1", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL2", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL3", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL4", "STRING", "NULLABLE"),
                bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                bigquery.SchemaField("update_datetime", "DATETIME",
                                     "REQUIRED"),
            ],
        )
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_with_multibyte_raw_file_alternate_separator_and_encoding(
            self) -> None:
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagDoubleDaggerWINDOWS1252.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
        path = one(self.fs.gcs_file_system.uploaded_paths)

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=path.uri(),
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, "us_xx_raw_data"),
            destination_table_id="tagDoubleDaggerWINDOWS1252",
            destination_table_schema=[
                bigquery.SchemaField("PRIMARY_COL1", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL2", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL3", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL4", "STRING", "NULLABLE"),
                bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                bigquery.SchemaField("update_datetime", "DATETIME",
                                     "REQUIRED"),
            ],
        )
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_multiple_chunks_even_division(self) -> None:

        self.import_manager.upload_chunk_size = 1

        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagPipeSeparatedNonUTF8.txt",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(5, len(self.fs.gcs_file_system.uploaded_paths))

        expected_insert_calls = [
            call(
                source_uri=uploaded_path.uri(),
                destination_dataset_ref=bigquery.DatasetReference(
                    self.project_id, "us_xx_raw_data"),
                destination_table_id="tagPipeSeparatedNonUTF8",
                destination_table_schema=[
                    bigquery.SchemaField("PRIMARY_COL1", "STRING", "NULLABLE"),
                    bigquery.SchemaField("COL2", "STRING", "NULLABLE"),
                    bigquery.SchemaField("COL3", "STRING", "NULLABLE"),
                    bigquery.SchemaField("COL4", "STRING", "NULLABLE"),
                    bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                    bigquery.SchemaField("update_datetime", "DATETIME",
                                         "REQUIRED"),
                ],
            ) for uploaded_path in self.fs.gcs_file_system.uploaded_paths
        ]

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_has_calls(
            expected_insert_calls, any_order=True)
        self.assertEqual(
            len(expected_insert_calls) - 1, self.mock_time.sleep.call_count)
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_multiple_chunks_uneven_division(self) -> None:

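        # A chunk size of 2 should split the 5-row fixture into uploads of 2, 2, and 1 rows.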
        self.import_manager.upload_chunk_size = 2

        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagPipeSeparatedNonUTF8.txt",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(3, len(self.fs.gcs_file_system.uploaded_paths))

        expected_insert_calls = [
            call(
                source_uri=uploaded_path.uri(),
                destination_dataset_ref=bigquery.DatasetReference(
                    self.project_id, "us_xx_raw_data"),
                destination_table_id="tagPipeSeparatedNonUTF8",
                destination_table_schema=[
                    bigquery.SchemaField("PRIMARY_COL1", "STRING", "NULLABLE"),
                    bigquery.SchemaField("COL2", "STRING", "NULLABLE"),
                    bigquery.SchemaField("COL3", "STRING", "NULLABLE"),
                    bigquery.SchemaField("COL4", "STRING", "NULLABLE"),
                    bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                    bigquery.SchemaField("update_datetime", "DATETIME",
                                         "REQUIRED"),
                ],
            ) for uploaded_path in self.fs.gcs_file_system.uploaded_paths
        ]

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_has_calls(
            expected_insert_calls, any_order=True)
        self.assertEqual(
            len(expected_insert_calls) - 1, self.mock_time.sleep.call_count)
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_with_raw_file_invalid_column_chars(self) -> None:
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagInvalidCharacters.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
        path = one(self.fs.gcs_file_system.uploaded_paths)

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=path.uri(),
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, "us_xx_raw_data"),
            destination_table_id="tagInvalidCharacters",
            destination_table_schema=[
                bigquery.SchemaField("COL_1", "STRING", "NULLABLE"),
                bigquery.SchemaField("_COL2", "STRING", "NULLABLE"),
                bigquery.SchemaField("_3COL", "STRING", "NULLABLE"),
                bigquery.SchemaField("_4_COL", "STRING", "NULLABLE"),
                bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                bigquery.SchemaField("update_datetime", "DATETIME",
                                     "REQUIRED"),
            ],
        )
        self.assertEqual(1, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_with_cols_not_matching_capitalization(
            self) -> None:
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagColCapsDoNotMatchConfig.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
        )

        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
        path = one(self.fs.gcs_file_system.uploaded_paths)

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=path.uri(),
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, "us_xx_raw_data"),
            destination_table_id="tagColCapsDoNotMatchConfig",
            destination_table_schema=[
                bigquery.SchemaField("COL1", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL_2", "STRING", "NULLABLE"),
                bigquery.SchemaField("Col3", "STRING", "NULLABLE"),
                bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                bigquery.SchemaField("update_datetime", "DATETIME",
                                     "REQUIRED"),
            ],
        )
        self.assertEqual(2, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_with_raw_file_normalization_conflict(self) -> None:
        with self.assertRaises(ValueError) as e:
            file_path = path_for_fixture_file_in_test_gcs_directory(
                bucket_path=self.ingest_bucket_path,
                filename="tagNormalizationConflict.csv",
                should_normalize=True,
                file_type=GcsfsDirectIngestFileType.RAW_DATA,
            )

            fixture_util.add_direct_ingest_path(
                self.fs.gcs_file_system,
                file_path,
                region_code=self.test_region.region_code,
            )

            self.import_manager.import_raw_file_to_big_query(
                file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(
            str(e.exception),
            "Multiple columns with name [_4COL] after normalization.")

    def test_import_bq_file_with_migrations(self) -> None:
        file_datetime = migrations_tagC.DATE_1
        file_path = path_for_fixture_file_in_test_gcs_directory(
            bucket_path=self.ingest_bucket_path,
            filename="tagC.csv",
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA,
            dt=file_datetime,
        )
        fixture_util.add_direct_ingest_path(
            self.fs.gcs_file_system,
            file_path,
            region_code=self.test_region.region_code)

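        # One mock job per expected migration query (an UPDATE and a DELETE) so the test can verify that each
        # job's result is waited on.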
        mock_query_jobs = [
            mock.MagicMock(),
            mock.MagicMock(),
        ]

        self.mock_big_query_client.run_query_async.side_effect = mock_query_jobs

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.mock_big_query_client.run_query_async.assert_has_calls([
            mock.call(
                query_str="UPDATE `recidiviz-456.us_xx_raw_data.tagC` original\n"
                "SET COL1 = updates.new__COL1\n"
                "FROM (SELECT * FROM UNNEST([\n"
                "    STRUCT('123' AS COL1, CAST('2020-06-10T00:00:00' AS DATETIME) AS update_datetime, '456' AS new__COL1),\n"
                "    STRUCT('123' AS COL1, CAST('2020-09-21T00:00:00' AS DATETIME) AS update_datetime, '456' AS new__COL1)\n"
                "])) updates\n"
                "WHERE original.COL1 = updates.COL1 AND original.update_datetime = updates.update_datetime;"
            ),
            mock.call(
                query_str="DELETE FROM `recidiviz-456.us_xx_raw_data.tagC`\n"
                "WHERE STRUCT(COL1) IN (\n"
                "    STRUCT('789')\n"
                ");"),
        ])

        for mock_query_job in mock_query_jobs:
            mock_query_job.result.assert_called_once()


class GcsfsDirectIngestController(
        BaseDirectIngestController[GcsfsIngestArgs, GcsfsFileContentsHandle]):
    """Controller for parsing and persisting a file in the GCS filesystem."""

    _MAX_STORAGE_FILE_RENAME_TRIES = 10
    _DEFAULT_MAX_PROCESS_JOB_WAIT_TIME_SEC = 300
    _FILE_SPLIT_LINE_LIMIT = 2500

    def __init__(self,
                 region_name: str,
                 system_level: SystemLevel,
                 ingest_directory_path: Optional[str] = None,
                 storage_directory_path: Optional[str] = None,
                 max_delay_sec_between_files: Optional[int] = None):
        super().__init__(region_name, system_level)
        self.fs = GcsfsFactory.build()
        self.max_delay_sec_between_files = max_delay_sec_between_files

        if not ingest_directory_path:
            ingest_directory_path = \
                gcsfs_direct_ingest_directory_path_for_region(region_name,
                                                              system_level)
        self.ingest_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(ingest_directory_path)

        if not storage_directory_path:
            storage_directory_path = \
                gcsfs_direct_ingest_storage_directory_path_for_region(
                    region_name, system_level)

        self.storage_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(storage_directory_path)

        ingest_job_file_type_filter = \
            GcsfsDirectIngestFileType.INGEST_VIEW \
            if self.region.is_raw_vs_ingest_file_name_detection_enabled() else None
        self.file_prioritizer = \
            GcsfsDirectIngestJobPrioritizer(
                self.fs,
                self.ingest_directory_path,
                self.get_file_tag_rank_list(),
                ingest_job_file_type_filter)

        self.file_split_line_limit = self._FILE_SPLIT_LINE_LIMIT

        self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
            region_code=self.region.region_code)

        self.raw_file_import_manager = DirectIngestRawFileImportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=GcsfsDirectoryPath.from_absolute_path(
                gcsfs_direct_ingest_temporary_output_directory_path()),
            big_query_client=BigQueryClientImpl()
        )

        self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            file_metadata_manager=self.file_metadata_manager,
            big_query_client=BigQueryClientImpl(),
            view_collector=DirectIngestPreProcessedIngestViewCollector(self.region, self.get_file_tag_rank_list())
        )

    # ================= #
    # NEW FILE HANDLING #
    # ================= #
    def handle_file(self, path: GcsfsFilePath, start_ingest: bool):
        """Called when a single new file is added to an ingest bucket (may also
        be called as a result of a rename).

        May be called from any worker/queue.
        """
        if self.fs.is_processed_file(path):
            logging.info("File [%s] is already processed, returning.",
                         path.abs_path())
            return

        if self.fs.is_normalized_file_path(path):
            parts = filename_parts_from_path(path)

            if self.region.is_raw_vs_ingest_file_name_detection_enabled():
                if parts.file_type == GcsfsDirectIngestFileType.RAW_DATA or (
                        parts.file_type == GcsfsDirectIngestFileType.INGEST_VIEW and
                        self.region.are_ingest_view_exports_enabled_in_env()
                ):
                    # TODO(3020): Design/handle/write tests for case where this is a file we've moved from storage for a
                    #  rerun. Right now we will crash here because we'll try to set a discovery time that comes after
                    #  the processed time.
                    self.file_metadata_manager.register_new_file(path)

            if parts.is_file_split and \
                    parts.file_split_size and \
                    parts.file_split_size <= self.file_split_line_limit:
                self.kick_scheduler(just_finished_job=False)
                logging.info("File [%s] is already normalized and split split "
                             "with correct size, kicking scheduler.",
                             path.abs_path())
                return

        logging.info("Creating cloud task to schedule next job.")
        self.cloud_task_manager.create_direct_ingest_handle_new_files_task(
            region=self.region,
            can_start_ingest=start_ingest)

    def handle_new_files(self, can_start_ingest: bool):
        """Searches the ingest directory for new/unprocessed files. Normalizes
        file names and splits files as necessary, schedules the next ingest job
        if allowed.

        Should only be called from the scheduler queue.
        """
        unnormalized_paths = self.fs.get_unnormalized_file_paths(self.ingest_directory_path)

        unnormalized_path_file_type = GcsfsDirectIngestFileType.RAW_DATA \
            if self.region.is_raw_vs_ingest_file_name_detection_enabled() else GcsfsDirectIngestFileType.UNSPECIFIED

        for path in unnormalized_paths:
            logging.info("File [%s] is not yet seen, normalizing.",
                         path.abs_path())
            self.fs.mv_path_to_normalized_path(path, file_type=unnormalized_path_file_type)

        if unnormalized_paths:
            logging.info(
                "Normalized at least one path - returning, will handle "
                "normalized files separately.")
            # Normalizing file paths will cause the cloud function that calls
            # this function to be re-triggered.
            return

        if self._schedule_any_pre_ingest_tasks():
            logging.info("Found pre-ingest tasks to schedule - returning.")
            return

        ingest_file_type_filter = GcsfsDirectIngestFileType.INGEST_VIEW \
            if self.region.is_raw_vs_ingest_file_name_detection_enabled() else None

        unprocessed_paths = self.fs.get_unprocessed_file_paths(self.ingest_directory_path,
                                                               file_type_filter=ingest_file_type_filter)

        did_split = False
        for path in unprocessed_paths:
            if self._split_file_if_necessary(path):
                did_split = True

        if did_split:
            logging.info(
                "Split at least one path - returning, will handle split "
                "files separately.")
            # Writing new split files to storage will cause the cloud function
            # that calls this function to be re-triggered.
            return

        if can_start_ingest and unprocessed_paths:
            self.schedule_next_ingest_job_or_wait_if_necessary(
                just_finished_job=False)

    def do_raw_data_import(self, data_import_args: GcsfsRawDataBQImportArgs) -> None:
        """Process a raw incoming file by importing it to BQ, tracking it in our metadata tables, and moving it to
        storage on completion.
        """
        check_is_region_launched_in_env(self.region)
        if not self.region.are_raw_data_bq_imports_enabled_in_env():
            raise ValueError(f'Raw data imports not enabled for region [{self.region.region_code}]')

        if not self.fs.exists(data_import_args.raw_data_file_path):
            logging.warning(
                "File path [%s] no longer exists - might have already been "
                "processed or deleted", data_import_args.raw_data_file_path)
            self.cloud_task_manager.create_direct_ingest_handle_new_files_task(self.region, can_start_ingest=True)
            return

        file_metadata = self.file_metadata_manager.get_file_metadata(data_import_args.raw_data_file_path)

        if file_metadata.processed_time:
            logging.warning('File [%s] is already marked as processed. Skipping file processing.',
                            data_import_args.raw_data_file_path.file_name)
            self.cloud_task_manager.create_direct_ingest_handle_new_files_task(self.region, can_start_ingest=True)
            return

        self.raw_file_import_manager.import_raw_file_to_big_query(data_import_args.raw_data_file_path,
                                                                  file_metadata)

        if not self.region.are_ingest_view_exports_enabled_in_env():
            # TODO(3162) This is a stopgap measure for regions that have only partially launched. Delete once SQL
            #  pre-processing is enabled for all direct ingest regions.
            parts = filename_parts_from_path(data_import_args.raw_data_file_path)
            ingest_file_tags = self.get_file_tag_rank_list()

            if parts.file_tag in ingest_file_tags:
                self.fs.copy(
                    data_import_args.raw_data_file_path,
                    GcsfsFilePath.from_absolute_path(to_normalized_unprocessed_file_path_from_normalized_path(
                        data_import_args.raw_data_file_path.abs_path(),
                        file_type_override=GcsfsDirectIngestFileType.INGEST_VIEW
                    ))
                )

        processed_path = self.fs.mv_path_to_processed_path(data_import_args.raw_data_file_path)
        self.file_metadata_manager.mark_file_as_processed(path=data_import_args.raw_data_file_path)

        self.fs.mv_path_to_storage(processed_path, self.storage_directory_path)
        self.cloud_task_manager.create_direct_ingest_handle_new_files_task(self.region, can_start_ingest=True)

    def do_ingest_view_export(self, ingest_view_export_args: GcsfsIngestViewExportArgs) -> None:
        check_is_region_launched_in_env(self.region)
        if not self.region.are_ingest_view_exports_enabled_in_env():
            raise ValueError(f'Ingest view exports not enabled for region [{self.region.region_code}]. Passed args: '
                             f'{ingest_view_export_args}')

        did_export = self.ingest_view_export_manager.export_view_for_args(ingest_view_export_args)
        if not did_export or not self.file_metadata_manager.get_ingest_view_metadata_pending_export():
            logging.info("Creating cloud task to schedule next job.")
            self.cloud_task_manager.create_direct_ingest_handle_new_files_task(region=self.region,
                                                                               can_start_ingest=True)

    # ============== #
    # JOB SCHEDULING #
    # ============== #

    def _schedule_any_pre_ingest_tasks(self) -> bool:
        """Schedules any tasks related to SQL preprocessing of new files in preparation for ingest of those files into
        our Postgres database.

        Returns True if any jobs were scheduled or if there were already any pre-ingest jobs scheduled. Returns False if
        there are no remaining ingest jobs to schedule and it is safe to proceed with ingest.
        """
        if self._schedule_raw_data_import_tasks():
            logging.info("Found pre-ingest raw data import tasks to schedule.")
            return True
        # TODO(3020): We have logic to ensure that we wait 10 min for all files to upload properly before moving on to
        #  ingest. We probably actually need this to happen between raw data import and ingest view export steps - if we
        #  haven't seen all files yet and most recent raw data file came in sometime in the last 10 min, we should wait
        #  to do view exports.
        if self._schedule_ingest_view_export_tasks():
            logging.info("Found pre-ingest view export tasks to schedule.")
            return True
        return False

    def _schedule_raw_data_import_tasks(self) -> bool:
        if not self.region.are_raw_data_bq_imports_enabled_in_env():
            return False

        queue_info = self.cloud_task_manager.get_bq_import_export_queue_info(self.region)

        did_schedule = False
        tasks_to_schedule = [GcsfsRawDataBQImportArgs(path)
                             for path in self.raw_file_import_manager.get_unprocessed_raw_files_to_import()]
        for task_args in tasks_to_schedule:
            if not queue_info.has_task_already_scheduled(task_args):
                self.cloud_task_manager.create_direct_ingest_raw_data_import_task(self.region, task_args)
                did_schedule = True

        return queue_info.has_raw_data_import_jobs_queued() or did_schedule

    def _schedule_ingest_view_export_tasks(self) -> bool:
        if not self.region.are_ingest_view_exports_enabled_in_env():
            return False

        queue_info = self.cloud_task_manager.get_bq_import_export_queue_info(self.region)
        if queue_info.has_ingest_view_export_jobs_queued():
            # Since we schedule all export jobs at once, after all raw files have been processed, we wait for all of the
            # export jobs to be done before checking if we need to schedule more.
            return True

        did_schedule = False
        tasks_to_schedule = self.ingest_view_export_manager.get_ingest_view_export_task_args()

        rank_list = self.get_file_tag_rank_list()

        # Sort by tag order and export datetime
        tasks_to_schedule.sort(key=lambda args: (rank_list.index(args.ingest_view_name),
                                                 args.upper_bound_datetime_to_export))

        for task_args in tasks_to_schedule:
            if not queue_info.has_task_already_scheduled(task_args):
                self.cloud_task_manager.create_direct_ingest_ingest_view_export_task(self.region, task_args)
                did_schedule = True

        return did_schedule

    @classmethod
    @abc.abstractmethod
    def get_file_tag_rank_list(cls) -> List[str]:
        pass

    def _get_next_job_args(self) -> Optional[GcsfsIngestArgs]:
        return self.file_prioritizer.get_next_job_args()

    def _wait_time_sec_for_next_args(self, args: GcsfsIngestArgs) -> int:
        if self.file_prioritizer.are_next_args_expected(args):
            # Run job immediately
            return 0

        now = datetime.datetime.utcnow()
        file_upload_time: datetime.datetime = \
            filename_parts_from_path(args.file_path).utc_upload_datetime

        max_delay_sec = self.max_delay_sec_between_files \
            if self.max_delay_sec_between_files is not None \
            else self._DEFAULT_MAX_PROCESS_JOB_WAIT_TIME_SEC
        max_wait_from_file_upload_time = \
            file_upload_time + datetime.timedelta(seconds=max_delay_sec)

        if max_wait_from_file_upload_time <= now:
            wait_time = 0
        else:
            wait_time = (max_wait_from_file_upload_time - now).seconds

        logging.info("Waiting [%s] sec for [%s]",
                     wait_time, self._job_tag(args))
        return wait_time

    def _on_job_scheduled(self, ingest_args: GcsfsIngestArgs):
        pass

    # =================== #
    # SINGLE JOB RUN CODE #
    # =================== #

    def _job_tag(self, args: GcsfsIngestArgs) -> str:
        return f'{self.region.region_code}/{args.file_path.file_name}:' \
            f'{args.ingest_time}'

    def _get_contents_handle(
            self, args: GcsfsIngestArgs) -> Optional[GcsfsFileContentsHandle]:
        return self._get_contents_handle_from_path(args.file_path)

    def _get_contents_handle_from_path(
            self, path: GcsfsFilePath) -> Optional[GcsfsFileContentsHandle]:
        return self.fs.download_to_temp_file(path)

    @abc.abstractmethod
    def _are_contents_empty(self,
                            contents_handle: GcsfsFileContentsHandle) -> bool:
        pass

    def _can_proceed_with_ingest_for_contents(
            self,
            contents_handle: GcsfsFileContentsHandle) -> bool:
        return self._are_contents_empty(contents_handle) or \
               self._file_meets_file_line_limit(contents_handle)

    @abc.abstractmethod
    def _file_meets_file_line_limit(
            self, contents_handle: GcsfsFileContentsHandle) -> bool:
        """Subclasses should implement to determine whether the file meets the
        expected line limit"""

    @abc.abstractmethod
    def _parse(self,
               args: GcsfsIngestArgs,
               contents_handle: GcsfsFileContentsHandle) -> IngestInfo:
        pass

    def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
        """Checks if the given file needs to be split according to this
        controller's |ingest_file_split_line_limit|.

        Returns True if the file was split, False if splitting was not necessary.
        """
        parts = filename_parts_from_path(path)

        if self.region.is_raw_vs_ingest_file_name_detection_enabled() and \
                parts.file_type != GcsfsDirectIngestFileType.INGEST_VIEW:
            raise ValueError(f'Should not be attempting to split files other than ingest view files, found path with '
                             f'file type: {parts.file_type}')

        if parts.file_tag not in self.get_file_tag_rank_list():
            logging.info("File tag [%s] for path [%s] not in rank list - "
                         "not splitting.",
                         parts.file_tag,
                         path.abs_path())
            return False

        if parts.is_file_split and \
                parts.file_split_size and \
                parts.file_split_size <= self.ingest_file_split_line_limit:
            logging.info("File [%s] already split with size [%s].",
                         path.abs_path(), parts.file_split_size)
            return False

        file_contents_handle = self._get_contents_handle_from_path(path)

        if not file_contents_handle:
            logging.info("File [%s] has no rows - not splitting.",
                         path.abs_path())
            return False

        if self._can_proceed_with_ingest_for_contents(file_contents_handle):
            logging.info("No need to split file path [%s].", path.abs_path())
            return False

        logging.info("Proceeding to file splitting for path [%s].",
                     path.abs_path())

        split_contents_handles = self._split_file(path, file_contents_handle)

        original_metadata = None
        if self.region.are_ingest_view_exports_enabled_in_env():
            original_metadata = self.file_metadata_manager.get_file_metadata(path)

        output_dir = GcsfsDirectoryPath.from_file_path(path)
        for i, split_contents_handle in enumerate(split_contents_handles):
            upload_path = self._create_split_file_path(path, output_dir, split_num=i)

            ingest_file_metadata = None

            if self.region.are_ingest_view_exports_enabled_in_env():
                if not isinstance(original_metadata, DirectIngestIngestFileMetadata):
                    raise ValueError('Attempting to split a non-ingest view type file')

                ingest_file_metadata = self.file_metadata_manager.register_ingest_file_split(original_metadata,
                                                                                             upload_path)
            logging.info("Writing file split [%s] to Cloud Storage.", upload_path.abs_path())
            self.fs.upload_from_contents_handle(upload_path, split_contents_handle, self._contents_type())

            if self.region.are_ingest_view_exports_enabled_in_env():
                if not ingest_file_metadata:
                    raise ValueError(f'Split file metadata for path unexpectedly none [{upload_path.abs_path()}]')

                self.file_metadata_manager.mark_ingest_view_exported(ingest_file_metadata)

        if self.region.are_ingest_view_exports_enabled_in_env():
            self.file_metadata_manager.mark_file_as_processed(path)

        logging.info("Done splitting file [%s] into [%s] paths, moving it to storage.",
                     path.abs_path(), len(split_contents_handles))

        self.fs.mv_path_to_storage(path, self.storage_directory_path)

        return True

    def _create_split_file_path(self,
                                original_file_path: GcsfsFilePath,
                                output_dir: GcsfsDirectoryPath,
                                split_num: int) -> GcsfsFilePath:
        parts = filename_parts_from_path(original_file_path)

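        # Encode the split index and the split line limit in the file name so that
        # filename_parts_from_path can later recognize already-split files via
        # parts.is_file_split / parts.file_split_size.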
        rank_str = str(split_num + 1).zfill(5)
        updated_file_name = (
            f'{parts.stripped_file_name}_{rank_str}'
            f'_{SPLIT_FILE_SUFFIX}_size{self.ingest_file_split_line_limit}'
            f'.{parts.extension}')

        file_type = GcsfsDirectIngestFileType.INGEST_VIEW \
            if self.region.is_raw_vs_ingest_file_name_detection_enabled() else GcsfsDirectIngestFileType.UNSPECIFIED

        return GcsfsFilePath.from_directory_and_file_name(
            output_dir,
            to_normalized_unprocessed_file_path(updated_file_name,
                                                file_type=file_type,
                                                dt=parts.utc_upload_datetime))

    @abc.abstractmethod
    def _split_file(self,
                    path: GcsfsFilePath,
                    file_contents_handle: GcsfsFileContentsHandle) -> List[GcsfsFileContentsHandle]:
        """Should be implemented by subclasses to split a file accessible via the provided contents handle into multiple
        files in separate contents handles. """

    @abc.abstractmethod
    def _contents_type(self) -> str:
        """Returns the contents type for the contents this controller handles, e.g. 'text/csv'."""

    def _do_cleanup(self, args: GcsfsIngestArgs):
        self.fs.mv_path_to_processed_path(args.file_path)

        if self.region.are_ingest_view_exports_enabled_in_env():
            self.file_metadata_manager.mark_file_as_processed(args.file_path)

        parts = filename_parts_from_path(args.file_path)
        self._move_processed_files_to_storage_as_necessary(
            last_processed_date_str=parts.date_str)

    def _is_last_job_for_day(self, args: GcsfsIngestArgs) -> bool:
        """Returns True if the file handled in |args| is the last file for that
        upload date."""
        parts = filename_parts_from_path(args.file_path)
        upload_date, date_str = parts.utc_upload_datetime, parts.date_str
        more_jobs_expected = \
            self.file_prioritizer.are_more_jobs_expected_for_day(date_str)
        if more_jobs_expected:
            return False
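        # No more unseen files are expected for this date - this was the last job of
        # the day unless another queued job shares (or precedes) this upload datetime.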
        next_job_args = self.file_prioritizer.get_next_job_args(date_str)
        if next_job_args:
            next_job_date = filename_parts_from_path(
                next_job_args.file_path).utc_upload_datetime
            return next_job_date > upload_date
        return True

    def _move_processed_files_to_storage_as_necessary(
            self, last_processed_date_str: str):
        next_args = self.file_prioritizer.get_next_job_args()

        should_move_last_processed_date = False
        if not next_args:
            are_more_jobs_expected = \
                self.file_prioritizer.are_more_jobs_expected_for_day(
                    last_processed_date_str)
            if not are_more_jobs_expected:
                should_move_last_processed_date = True
        else:
            next_date_str = \
                filename_parts_from_path(next_args.file_path).date_str
            if next_date_str < last_processed_date_str:
                logging.info("Found a file [%s] from a date previous to our "
                             "last processed date - not moving anything to "
                             "storage.", next_args.file_path.abs_path())
                return

            # If there are still more to process on this day, do not move files
            # from this day.
            should_move_last_processed_date = \
                next_date_str != last_processed_date_str

        # Note: at this point, we expect RAW file type files to already have been moved once they were imported to BQ.
        file_type_to_move = GcsfsDirectIngestFileType.INGEST_VIEW \
            if self.region.is_raw_vs_ingest_file_name_detection_enabled() else None

        self.fs.mv_processed_paths_before_date_to_storage(
            self.ingest_directory_path,
            self.storage_directory_path,
            file_type_filter=file_type_to_move,
            date_str_bound=last_processed_date_str,
            include_bound=should_move_last_processed_date)

    @staticmethod
    def file_tag(file_path: GcsfsFilePath) -> str:
        return filename_parts_from_path(file_path).file_tag
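
The bounded wait in _wait_time_sec_for_next_args above can be illustrated with a
standard-library-only sketch; the function name, the 300-second default, and the sample
timestamps below are illustrative and not taken from the controller code.

import datetime

# Standalone sketch of the bounded-wait arithmetic in _wait_time_sec_for_next_args
# above. The 300-second grace period is an assumed value for illustration only.
def wait_time_sec(file_upload_time: datetime.datetime,
                  now: datetime.datetime,
                  max_delay_sec: int = 300) -> int:
    """Returns how many seconds to wait before processing an unexpected file."""
    deadline = file_upload_time + datetime.timedelta(seconds=max_delay_sec)
    if deadline <= now:
        # The grace period has already elapsed - process immediately.
        return 0
    # Otherwise wait out the remainder of the grace period.
    return (deadline - now).seconds

upload = datetime.datetime(2020, 1, 1, 12, 0, 0)
print(wait_time_sec(upload, now=datetime.datetime(2020, 1, 1, 12, 2, 0)))   # 180
print(wait_time_sec(upload, now=datetime.datetime(2020, 1, 1, 12, 10, 0)))  # 0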
Example #13
0
def compare_raw_data_between_projects(
    region_code: str,
    source_project_id: str = environment.GCP_PROJECT_STAGING,
    comparison_project_id: str = environment.GCP_PROJECT_PRODUCTION,
) -> List[str]:
    """Compares the raw data between staging and production for a given region."""
    logging.info(
        "**** Ensuring all raw data for [%s] in [%s] also exists in [%s] ****",
        region_code.upper(),
        source_project_id,
        comparison_project_id,
    )

    raw_file_config = DirectIngestRegionRawFileConfig(region_code)

    bq_client = BigQueryClientImpl(project_id=source_project_id)
    dataset_id = DirectIngestRawFileImportManager.raw_tables_dataset_for_region(
        region_code)
    source_dataset = bq_client.dataset_ref_for_id(dataset_id)

    query_jobs: Dict[str, bigquery.QueryJob] = {}
    for file_tag, file_config in raw_file_config.raw_file_configs.items():
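        # Skip tables with nothing to compare: missing from the source project,
        # undocumented, or lacking primary keys.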
        if (not bq_client.table_exists(source_dataset, file_tag)
                or file_config.is_undocumented
                or not file_config.primary_key_cols):
            continue

        columns = ", ".join(
            [column.name for column in file_config.available_columns])

        query_job = bq_client.run_query_async(
            query_str=COMPARISON_TEMPLATE.format(
                source_project_id=source_project_id,
                comparison_project_id=comparison_project_id,
                raw_data_dataset_id=dataset_id,
                raw_data_table_id=file_tag,
                columns=columns,
            ))
        query_jobs[file_tag] = query_job

    table_column_width = min(
        max(len(tag) for tag in raw_file_config.raw_file_configs), 30)

    failed_tables: List[str] = []
    for file_tag in sorted(raw_file_config.raw_file_tags):
        justified_name = file_tag.ljust(table_column_width)

        if file_tag not in query_jobs:
            # This file did not exist in the project that is the source of truth.
            continue

        query_job = query_jobs[file_tag]
        try:
            rows = query_job.result()
        except exceptions.NotFound:
            logging.warning(
                "%s | Missing table %s.%s.%s",
                justified_name,
                comparison_project_id,
                dataset_id,
                file_tag,
            )
            failed_tables.append(file_tag)
            continue

        counts: List[Tuple[datetime.datetime,
                           int]] = [row.values() for row in rows]

        if counts:
            logging.warning(
                "%s | Missing data in the %s table",
                justified_name,
                comparison_project_id,
            )
            for update_datetime, num_missing in counts:
                logging.warning("\t%s: %d", update_datetime.isoformat(),
                                num_missing)
            failed_tables.append(file_tag)
        else:
            logging.info(
                "%s | %s contains all of the data from %s",
                justified_name,
                comparison_project_id,
                source_project_id,
            )

    return failed_tables
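
A minimal sketch of how compare_raw_data_between_projects might be wired into a
command-line entry point; the argparse flags and logging setup are assumptions, and only
compare_raw_data_between_projects and the environment constants come from the code above.

import argparse
import logging
import sys

def main() -> int:
    # Hypothetical CLI wrapper around compare_raw_data_between_projects above.
    parser = argparse.ArgumentParser(
        description="Check that staging raw data also exists in production.")
    parser.add_argument("--region-code", required=True, help="e.g. us_xx")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    failed_tables = compare_raw_data_between_projects(
        region_code=args.region_code,
        source_project_id=environment.GCP_PROJECT_STAGING,
        comparison_project_id=environment.GCP_PROJECT_PRODUCTION,
    )
    if failed_tables:
        logging.error("Found [%s] table(s) with missing data: %s",
                      len(failed_tables), failed_tables)
        return 1
    logging.info("All raw data tables match.")
    return 0

if __name__ == "__main__":
    sys.exit(main())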
Example #14
0
class BaseDirectIngestController(Ingestor):
    """Parses and persists individual-level info from direct ingest partners."""

    _INGEST_FILE_SPLIT_LINE_LIMIT = 2500

    def __init__(self, ingest_bucket_path: GcsfsBucketPath) -> None:
        """Initialize the controller."""
        self.cloud_task_manager = DirectIngestCloudTaskManagerImpl()
        self.ingest_instance = DirectIngestInstance.for_ingest_bucket(
            ingest_bucket_path)
        self.region_lock_manager = DirectIngestRegionLockManager.for_direct_ingest(
            region_code=self.region.region_code,
            schema_type=self.system_level.schema_type(),
            ingest_instance=self.ingest_instance,
        )
        self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.ingest_bucket_path = ingest_bucket_path
        self.storage_directory_path = (
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code=self.region_code(),
                system_level=self.system_level,
                ingest_instance=self.ingest_instance,
            ))

        self.temp_output_directory_path = (
            gcsfs_direct_ingest_temporary_output_directory_path())

        self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_bucket_path,
            self.get_file_tag_rank_list(),
        )

        self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

        self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
            region_code=self.region.region_code,
            ingest_database_name=self.ingest_database_key.db_name,
        )

        self.raw_file_import_manager = DirectIngestRawFileImportManager(
            region=self.region,
            fs=self.fs,
            ingest_bucket_path=self.ingest_bucket_path,
            temp_output_directory_path=self.temp_output_directory_path,
            big_query_client=BigQueryClientImpl(),
        )

        self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
            region=self.region,
            fs=self.fs,
            output_bucket_name=self.ingest_bucket_path.bucket_name,
            file_metadata_manager=self.file_metadata_manager,
            big_query_client=BigQueryClientImpl(),
            view_collector=DirectIngestPreProcessedIngestViewCollector(
                self.region, self.get_file_tag_rank_list()),
            launched_file_tags=self.get_file_tag_rank_list(),
        )

        self.ingest_instance_status_manager = DirectIngestInstanceStatusManager(
            self.region_code(), self.ingest_instance)

    @property
    def region(self) -> Region:
        return regions.get_region(self.region_code().lower(),
                                  is_direct_ingest=True)

    @classmethod
    @abc.abstractmethod
    def region_code(cls) -> str:
        pass

    @abc.abstractmethod
    def get_file_tag_rank_list(self) -> List[str]:
        pass

    @property
    def system_level(self) -> SystemLevel:
        return SystemLevel.for_region(self.region)

    @property
    def ingest_database_key(self) -> SQLAlchemyDatabaseKey:
        schema_type = self.system_level.schema_type()
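        # STATE ingest writes to a per-state database whose version depends on the
        # ingest instance; all other schema types share a single database.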
        if schema_type == SchemaType.STATE:
            state_code = StateCode(self.region_code().upper())
            return SQLAlchemyDatabaseKey.for_state_code(
                state_code,
                self.ingest_instance.database_version(self.system_level,
                                                      state_code=state_code),
            )

        return SQLAlchemyDatabaseKey.for_schema(schema_type)

    # ============== #
    # JOB SCHEDULING #
    # ============== #
    def kick_scheduler(self, just_finished_job: bool) -> None:
        logging.info("Creating cloud task to schedule next job.")
        self.cloud_task_manager.create_direct_ingest_scheduler_queue_task(
            region=self.region,
            ingest_instance=self.ingest_instance,
            ingest_bucket=self.ingest_bucket_path,
            just_finished_job=just_finished_job,
        )

    def schedule_next_ingest_job(self, just_finished_job: bool) -> None:
        """Creates a cloud task to run a /process_job request for the file, which will
        process and commit the contents to Postgres."""
        check_is_region_launched_in_env(self.region)

        if self.ingest_instance_status_manager.is_instance_paused():
            logging.info("Ingest out of [%s] is currently paused.",
                         self.ingest_bucket_path.uri())
            return

        if self._schedule_any_pre_ingest_tasks():
            logging.info("Found pre-ingest tasks to schedule - returning.")
            return

        if self.region_lock_manager.is_locked():
            logging.info("Direct ingest is already locked on region [%s]",
                         self.region)
            return

        process_job_queue_info = self.cloud_task_manager.get_process_job_queue_info(
            self.region,
            self.ingest_instance,
        )
        if (process_job_queue_info.tasks_for_instance(
                region_code=self.region_code(),
                ingest_instance=self.ingest_instance)
                and not just_finished_job):
            logging.info(
                "Already running job [%s] - will not schedule another job for "
                "region [%s]",
                process_job_queue_info.task_names[0],
                self.region.region_code,
            )
            return

        next_job_args = self._get_next_job_args()

        if not next_job_args:
            logging.info(
                "No more jobs to run for region [%s] - returning",
                self.region.region_code,
            )
            return

        if process_job_queue_info.is_task_queued(self.region, next_job_args):
            logging.info(
                "Already have task queued for next job [%s] - returning.",
                self._job_tag(next_job_args),
            )
            return

        if not self.region_lock_manager.can_proceed():
            logging.info(
                "Postgres to BigQuery export is running, cannot run ingest - returning"
            )
            return

        logging.info("Creating cloud task to run job [%s]",
                     self._job_tag(next_job_args))
        self.cloud_task_manager.create_direct_ingest_process_job_task(
            region=self.region,
            ingest_instance=self.ingest_instance,
            ingest_args=next_job_args,
        )
        self._on_job_scheduled(next_job_args)

    def _schedule_any_pre_ingest_tasks(self) -> bool:
        """Schedules any tasks related to SQL preprocessing of new files in preparation
         for ingest of those files into our Postgres database.

        Returns True if any jobs were scheduled or if there were already any pre-ingest
        jobs scheduled. Returns False if there are no remaining ingest jobs to schedule
        and it is safe to proceed with ingest.
        """
        if self._schedule_raw_data_import_tasks():
            logging.info("Found pre-ingest raw data import tasks to schedule.")
            return True
        if self._schedule_ingest_view_export_tasks():
            logging.info("Found pre-ingest view export tasks to schedule.")
            return True
        return False

    def _schedule_raw_data_import_tasks(self) -> bool:
        """Schedules raw data import tasks for any newly discovered, unprocessed raw
        files, if they have not already been scheduled. If tasks are scheduled or are
        still running, returns True. Otherwise, if it's safe to proceed with next steps
        of ingest, returns False."""
        queue_info = self.cloud_task_manager.get_bq_import_export_queue_info(
            self.region, self.ingest_instance)

        did_schedule = False
        tasks_to_schedule = [
            GcsfsRawDataBQImportArgs(path) for path in
            self.raw_file_import_manager.get_unprocessed_raw_files_to_import()
        ]
        for task_args in tasks_to_schedule:
            # If the file path has not actually been discovered by the metadata manager yet, it likely was just added
            # and a subsequent call to handle_new_files will register it and trigger another call to this function so we can
            # schedule the appropriate job.
            discovered = self.file_metadata_manager.has_raw_file_been_discovered(
                task_args.raw_data_file_path)
            # If the file path has been processed, but still in the GCS bucket, it's likely due
            # to either a manual move or an accidental duplicate uploading. In either case, we
            # trust the database to have the source of truth.
            processed = self.file_metadata_manager.has_raw_file_been_processed(
                task_args.raw_data_file_path)
            if processed:
                logging.warning(
                    "File [%s] is already marked as processed. Skipping file processing.",
                    task_args.raw_data_file_path,
                )
            if (discovered and not processed
                    and not queue_info.has_task_already_scheduled(task_args)):
                self.cloud_task_manager.create_direct_ingest_raw_data_import_task(
                    self.region, self.ingest_instance, task_args)
                did_schedule = True

        return queue_info.has_raw_data_import_jobs_queued() or did_schedule

    def _schedule_ingest_view_export_tasks(self) -> bool:
        """Schedules all pending ingest view export tasks for launched ingest view tags,
         if they have not been scheduled. If tasks are scheduled or are still running,
        returns True. Otherwise, if it's safe to proceed with next steps of ingest,
        returns False.
        """
        queue_info = self.cloud_task_manager.get_bq_import_export_queue_info(
            self.region, self.ingest_instance)
        if queue_info.has_ingest_view_export_jobs_queued():
            # Since we schedule all export jobs at once, after all raw files have been processed, we wait for all of the
            # export jobs to be done before checking if we need to schedule more.
            return True

        did_schedule = False
        tasks_to_schedule = (
            self.ingest_view_export_manager.get_ingest_view_export_task_args())

        rank_list = self.get_file_tag_rank_list()
        ingest_view_name_rank = {
            ingest_view_name: i
            for i, ingest_view_name in enumerate(rank_list)
        }

        # Filter out views that aren't in ingest view tags.
        filtered_tasks_to_schedule = []
        for args in tasks_to_schedule:
            if args.ingest_view_name not in ingest_view_name_rank:
                logging.warning(
                    "Skipping ingest view task export for [%s] - not in controller ingest tags.",
                    args.ingest_view_name,
                )
                continue
            filtered_tasks_to_schedule.append(args)

        tasks_to_schedule = filtered_tasks_to_schedule

        # Sort by tag order and export datetime
        tasks_to_schedule.sort(key=lambda args_: (
            ingest_view_name_rank[args_.ingest_view_name],
            args_.upper_bound_datetime_to_export,
        ))

        for task_args in tasks_to_schedule:
            if not queue_info.has_task_already_scheduled(task_args):
                self.cloud_task_manager.create_direct_ingest_ingest_view_export_task(
                    self.region, self.ingest_instance, task_args)
                did_schedule = True

        return did_schedule

    def _get_next_job_args(self) -> Optional[GcsfsIngestArgs]:
        """Returns args for the next ingest job, or None if there is nothing to process."""
        args = self.file_prioritizer.get_next_job_args()

        if not args:
            return None

        discovered = self.file_metadata_manager.has_ingest_view_file_been_discovered(
            args.file_path)

        if not discovered:
            # If the file path has not actually been discovered by the controller yet, it likely was just added and a
            # subsequent call to handle_new_files will register it and trigger another call to this function so we can
            # schedule the appropriate job.
            logging.info(
                "Found args [%s] for a file that has not been discovered by the metadata manager yet - not scheduling.",
                args,
            )
            return None

        return args

    def _on_job_scheduled(self, ingest_args: GcsfsIngestArgs) -> None:
        """Called from the scheduler queue when an individual direct ingest job
        is scheduled.
        """

    # =================== #
    # SINGLE JOB RUN CODE #
    # =================== #
    def default_job_lock_timeout_in_seconds(self) -> int:
        """This method can be overridden by subclasses that need more (or less)
        time to process jobs to completion, but by default enforces a
        one hour timeout on locks.

        Jobs may take longer than the allotted time, but if they do, they
        will de facto relinquish their hold on the acquired lock."""
        return 3600

    def run_ingest_job_and_kick_scheduler_on_completion(
            self, args: GcsfsIngestArgs) -> None:
        check_is_region_launched_in_env(self.region)

        if self.ingest_instance_status_manager.is_instance_paused():
            logging.info("Ingest out of [%s] is currently paused.",
                         self.ingest_bucket_path.uri())
            return

        if not self.region_lock_manager.can_proceed():
            logging.warning(
                "Postgres to BigQuery export is running, can not run ingest")
            raise GCSPseudoLockAlreadyExists(
                "Postgres to BigQuery export is running, can not run ingest")

        with self.region_lock_manager.using_region_lock(
                expiration_in_seconds=self.default_job_lock_timeout_in_seconds(
                ), ):
            should_schedule = self._run_ingest_job(args)

        if should_schedule:
            self.kick_scheduler(just_finished_job=True)
            logging.info("Done running task. Returning.")

    def _run_ingest_job(self, args: GcsfsIngestArgs) -> bool:
        """
        Runs the full ingest process for this controller - reading and parsing
        raw input data, transforming it to our schema, then writing to the
        database.
        Returns:
            True if we should try to schedule the next job on completion. False,
             otherwise.
        """
        check_is_region_launched_in_env(self.region)

        start_time = datetime.datetime.now()
        logging.info("Starting ingest for ingest run [%s]",
                     self._job_tag(args))

        contents_handle = self._get_contents_handle(args)

        if contents_handle is None:
            logging.warning(
                "Failed to get contents handle for ingest run [%s] - "
                "returning.",
                self._job_tag(args),
            )
            # If the file no longer exists, we do want to kick the scheduler
            # again to pick up the next file to run. We expect this to happen
            # occasionally as a race when the scheduler picks up a file before
            # it has been properly moved.
            return True

        if not self._can_proceed_with_ingest_for_contents(
                args, contents_handle):
            logging.warning(
                "Cannot proceed with contents for ingest run [%s] - returning.",
                self._job_tag(args),
            )
            # If we get here, we've failed to properly split a file picked up
            # by the scheduler. We don't want to schedule a new job after
            # returning here, otherwise we'll get ourselves in a loop where we
            # continually try to schedule this file.
            return False

        logging.info("Successfully read contents for ingest run [%s]",
                     self._job_tag(args))

        if not self._are_contents_empty(args, contents_handle):
            self._parse_and_persist_contents(args, contents_handle)
        else:
            logging.warning(
                "Contents are empty for ingest run [%s] - skipping parse and "
                "persist steps.",
                self._job_tag(args),
            )

        self._do_cleanup(args)

        duration_sec = (datetime.datetime.now() - start_time).total_seconds()
        logging.info(
            "Finished ingest in [%s] sec for ingest run [%s].",
            str(duration_sec),
            self._job_tag(args),
        )

        return True

    @trace.span
    def _parse_and_persist_contents(
            self, args: GcsfsIngestArgs,
            contents_handle: GcsfsFileContentsHandle) -> None:
        """
        Runs the full ingest process for this controller for files with
        non-empty contents.
        """
        ii = self._parse(args, contents_handle)
        if not ii:
            raise DirectIngestError(
                error_type=DirectIngestErrorType.PARSE_ERROR,
                msg="No IngestInfo after parse.",
            )

        logging.info("Successfully parsed data for ingest run [%s]",
                     self._job_tag(args))

        ingest_info_proto = serialization.convert_ingest_info_to_proto(ii)

        logging.info(
            "Successfully converted ingest_info to proto for ingest "
            "run [%s]",
            self._job_tag(args),
        )

        ingest_metadata = self._get_ingest_metadata(args)
        persist_success = persistence.write(ingest_info_proto, ingest_metadata)

        if not persist_success:
            raise DirectIngestError(
                error_type=DirectIngestErrorType.PERSISTENCE_ERROR,
                msg="Persist step failed",
            )

        logging.info("Successfully persisted for ingest run [%s]",
                     self._job_tag(args))

    def _get_ingest_metadata(self, args: GcsfsIngestArgs) -> IngestMetadata:
        return IngestMetadata(
            region=self.region.region_code,
            jurisdiction_id=self.region.jurisdiction_id,
            ingest_time=args.ingest_time,
            enum_overrides=self.get_enum_overrides(),
            system_level=self.system_level,
            database_key=self.ingest_database_key,
        )

    def _job_tag(self, args: GcsfsIngestArgs) -> str:
        """Returns a (short) string tag to identify an ingest run in logs."""
        return (f"{self.region.region_code}/{args.file_path.file_name}:"
                f"{args.ingest_time}")

    def _get_contents_handle(
            self, args: GcsfsIngestArgs) -> Optional[GcsfsFileContentsHandle]:
        """Returns a handle to the contents that allows us to iterate over the contents
        and also manages cleanup of resources once we are done with the contents.

        Will return None if the contents could not be read (i.e. if they no
        longer exist).
        """
        return self._get_contents_handle_from_path(args.file_path)

    def _get_contents_handle_from_path(
            self, path: GcsfsFilePath) -> Optional[GcsfsFileContentsHandle]:
        return self.fs.download_to_temp_file(path)

    @abc.abstractmethod
    def _are_contents_empty(self, args: GcsfsIngestArgs,
                            contents_handle: GcsfsFileContentsHandle) -> bool:
        """Should be overridden by subclasses to return True if the contents
        for the given args should be considered "empty" and not parsed. For
        example, a CSV might have a single header line but no actual data.
        """

    @abc.abstractmethod
    def _parse(
            self, args: GcsfsIngestArgs,
            contents_handle: GcsfsFileContentsHandle
    ) -> ingest_info.IngestInfo:
        """Parses ingest view file contents into an IngestInfo object."""

    def _do_cleanup(self, args: GcsfsIngestArgs) -> None:
        """Does necessary cleanup once file contents have been successfully persisted to
        Postgres.
        """
        self.fs.mv_path_to_processed_path(args.file_path)

        self.file_metadata_manager.mark_ingest_view_file_as_processed(
            args.file_path)

        parts = filename_parts_from_path(args.file_path)
        self._move_processed_files_to_storage_as_necessary(
            last_processed_date_str=parts.date_str)

    def _can_proceed_with_ingest_for_contents(
            self, args: GcsfsIngestArgs,
            contents_handle: GcsfsFileContentsHandle) -> bool:
        """Given a pointer to the contents, returns whether the controller can continue
        ingest.
        """
        parts = filename_parts_from_path(args.file_path)
        return self._are_contents_empty(
            args, contents_handle) or not self._must_split_contents(
                parts.file_type, args.file_path)

    def _must_split_contents(self, file_type: GcsfsDirectIngestFileType,
                             path: GcsfsFilePath) -> bool:
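        # Raw data files are imported to BigQuery whole and are never split; only
        # ingest view files are subject to the line limit.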
        if file_type == GcsfsDirectIngestFileType.RAW_DATA:
            return False

        return not self._file_meets_file_line_limit(
            self.ingest_file_split_line_limit, path)

    @abc.abstractmethod
    def _file_meets_file_line_limit(self, line_limit: int,
                                    path: GcsfsFilePath) -> bool:
        """Subclasses should implement to determine whether the file meets the
        expected line limit"""

    def _move_processed_files_to_storage_as_necessary(
            self, last_processed_date_str: str) -> None:
        """Moves files that have already been ingested/processed, up to and including the given date, into storage,
        if there is nothing more left to ingest/process, i.e. we are not expecting more files."""
        next_args = self.file_prioritizer.get_next_job_args()

        should_move_last_processed_date = False
        if not next_args:
            are_more_jobs_expected = (
                self.file_prioritizer.are_more_jobs_expected_for_day(
                    last_processed_date_str))
            if not are_more_jobs_expected:
                should_move_last_processed_date = True
        else:
            next_date_str = filename_parts_from_path(
                next_args.file_path).date_str
            if next_date_str < last_processed_date_str:
                logging.info("Found a file [%s] from a date previous to our "
                             "last processed date - not moving anything to "
                             "storage.", next_args.file_path.abs_path())
                return

            # If there are still more to process on this day, do not move files
            # from this day.
            should_move_last_processed_date = next_date_str != last_processed_date_str

        # Note: at this point, we expect RAW file type files to already have been moved once they were imported to BQ.
        self.fs.mv_processed_paths_before_date_to_storage(
            self.ingest_bucket_path,
            self.storage_directory_path,
            file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW,
            date_str_bound=last_processed_date_str,
            include_bound=should_move_last_processed_date,
        )

    # ================= #
    # NEW FILE HANDLING #
    # ================= #
    def handle_file(self, path: GcsfsFilePath, start_ingest: bool) -> None:
        """Called when a single new file is added to an ingest bucket (may also
        be called as a result of a rename).

        May be called from any worker/queue.
        """
        if self.fs.is_processed_file(path):
            logging.info("File [%s] is already processed, returning.",
                         path.abs_path())
            return

        if self.fs.is_normalized_file_path(path):
            parts = filename_parts_from_path(path)

            if (parts.is_file_split and parts.file_split_size
                    and parts.file_split_size <=
                    self.ingest_file_split_line_limit):
                self.kick_scheduler(just_finished_job=False)
                logging.info(
                    "File [%s] is already normalized and split with the "
                    "correct size, kicking scheduler.",
                    path.abs_path(),
                )
                return

        logging.info("Creating cloud task to schedule next job.")
        self.cloud_task_manager.create_direct_ingest_handle_new_files_task(
            region=self.region,
            ingest_instance=self.ingest_instance,
            ingest_bucket=self.ingest_bucket_path,
            can_start_ingest=start_ingest,
        )

    def _register_all_new_paths_in_metadata(
            self, paths: List[GcsfsFilePath]) -> None:
        for path in paths:
            parts = filename_parts_from_path(path)
            if parts.file_type == GcsfsDirectIngestFileType.RAW_DATA:
                if not self.file_metadata_manager.has_raw_file_been_discovered(
                        path):
                    self.file_metadata_manager.mark_raw_file_as_discovered(
                        path)
            elif parts.file_type == GcsfsDirectIngestFileType.INGEST_VIEW:
                if not self.file_metadata_manager.has_ingest_view_file_been_discovered(
                        path):
                    self.file_metadata_manager.mark_ingest_view_file_as_discovered(
                        path)
            else:
                raise ValueError(f"Unexpected file type [{parts.file_type}]")

    @trace.span
    def handle_new_files(self, can_start_ingest: bool) -> None:
        """Searches the ingest directory for new/unprocessed files. Normalizes
        file names and splits files as necessary, and schedules the next ingest
        job if allowed.

        Should only be called from the scheduler queue.
        """
        if not can_start_ingest and self.region.is_ingest_launched_in_env():
            raise ValueError(
                "The can_start_ingest flag should only be used for regions where ingest is not yet launched in a "
                "particular environment. If we want to be able to selectively pause ingest processing for a state, we "
                "will first have to build a config that is respected by both the /ensure_all_raw_file_paths_normalized "
                "endpoint and any cloud functions that trigger ingest.")

        if self.ingest_instance_status_manager.is_instance_paused():
            logging.info("Ingest out of [%s] is currently paused.",
                         self.ingest_bucket_path.uri())
            return

        unnormalized_paths = self.fs.get_unnormalized_file_paths(
            self.ingest_bucket_path)

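        # Any file dropped into the ingest bucket without a normalized name is treated
        # as a raw data upload.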
        for path in unnormalized_paths:
            logging.info("File [%s] is not yet seen, normalizing.",
                         path.abs_path())
            self.fs.mv_path_to_normalized_path(
                path, file_type=GcsfsDirectIngestFileType.RAW_DATA)

        if unnormalized_paths:
            logging.info(
                "Normalized at least one path - returning, will handle "
                "normalized files separately.")
            # Normalizing file paths will cause the cloud function that calls
            # this function to be re-triggered.
            return

        if not can_start_ingest:
            logging.warning(
                "Ingest not configured to start post-file normalization - returning."
            )
            return

        check_is_region_launched_in_env(self.region)

        unprocessed_ingest_view_paths = self.fs.get_unprocessed_file_paths(
            self.ingest_bucket_path,
            file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW,
        )
        unprocessed_raw_paths = self.fs.get_unprocessed_file_paths(
            self.ingest_bucket_path,
            file_type_filter=GcsfsDirectIngestFileType.RAW_DATA,
        )
        if (unprocessed_raw_paths
                and self.ingest_instance == DirectIngestInstance.SECONDARY):
            raise ValueError(
                f"Raw data import not supported from SECONDARY ingest bucket "
                f"[{self.ingest_bucket_path}], but found {len(unprocessed_raw_paths)} "
                f"raw files. All raw files should be removed from this bucket and "
                f"uploaded to the primary ingest bucket, if appropriate.")

        self._register_all_new_paths_in_metadata(unprocessed_raw_paths)

        self._register_all_new_paths_in_metadata(unprocessed_ingest_view_paths)

        unprocessed_paths = unprocessed_raw_paths + unprocessed_ingest_view_paths
        did_split = False
        for path in unprocessed_ingest_view_paths:
            if self._split_file_if_necessary(path):
                did_split = True

        if did_split:
            post_split_unprocessed_ingest_view_paths = (
                self.fs.get_unprocessed_file_paths(
                    self.ingest_bucket_path,
                    file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW,
                ))
            self._register_all_new_paths_in_metadata(
                post_split_unprocessed_ingest_view_paths)

            logging.info(
                "Split at least one path - returning, will handle split "
                "files separately.")
            # Writing new split files to storage will cause the cloud function
            # that calls this function to be re-triggered.
            return

        if unprocessed_paths:
            self.schedule_next_ingest_job(just_finished_job=False)

    def do_raw_data_import(self,
                           data_import_args: GcsfsRawDataBQImportArgs) -> None:
        """Process a raw incoming file by importing it to BQ, tracking it in our metadata tables, and moving it to
        storage on completion.
        """
        check_is_region_launched_in_env(self.region)

        if self.ingest_instance_status_manager.is_instance_paused():
            logging.info("Ingest out of [%s] is currently paused.",
                         self.ingest_bucket_path.uri())
            return

        if self.ingest_instance == DirectIngestInstance.SECONDARY:
            raise ValueError(
                f"Raw data import not supported from SECONDARY ingest bucket "
                f"[{self.ingest_bucket_path}]. Raw data task for "
                f"[{data_import_args.raw_data_file_path}] should never have been "
                f"scheduled.")

        if not self.fs.exists(data_import_args.raw_data_file_path):
            logging.warning(
                "File path [%s] no longer exists - might have already been "
                "processed or deleted",
                data_import_args.raw_data_file_path,
            )
            self.kick_scheduler(just_finished_job=True)
            return

        file_metadata = self.file_metadata_manager.get_raw_file_metadata(
            data_import_args.raw_data_file_path)

        if file_metadata.processed_time:
            logging.warning(
                "File [%s] is already marked as processed. Skipping file processing.",
                data_import_args.raw_data_file_path.file_name,
            )
            self.kick_scheduler(just_finished_job=True)
            return

        self.raw_file_import_manager.import_raw_file_to_big_query(
            data_import_args.raw_data_file_path, file_metadata)

        processed_path = self.fs.mv_path_to_processed_path(
            data_import_args.raw_data_file_path)
        self.file_metadata_manager.mark_raw_file_as_processed(
            path=data_import_args.raw_data_file_path)

        self.fs.mv_path_to_storage(processed_path, self.storage_directory_path)
        self.kick_scheduler(just_finished_job=True)

    def do_ingest_view_export(
            self, ingest_view_export_args: GcsfsIngestViewExportArgs) -> None:
        check_is_region_launched_in_env(self.region)

        if self.ingest_instance_status_manager.is_instance_paused():
            logging.info("Ingest out of [%s] is currently paused.",
                         self.ingest_bucket_path.uri())
            return

        did_export = self.ingest_view_export_manager.export_view_for_args(
            ingest_view_export_args)
        if (not did_export or not self.file_metadata_manager.
                get_ingest_view_metadata_pending_export()):
            logging.info("Creating cloud task to schedule next job.")
            self.cloud_task_manager.create_direct_ingest_handle_new_files_task(
                region=self.region,
                ingest_instance=self.ingest_instance,
                ingest_bucket=self.ingest_bucket_path,
                can_start_ingest=True,
            )

    def _should_split_file(self, path: GcsfsFilePath) -> bool:
        """Returns True if the file at this path should be split, False otherwise."""
        parts = filename_parts_from_path(path)

        if parts.file_type != GcsfsDirectIngestFileType.INGEST_VIEW:
            raise ValueError(
                f"Should not be attempting to split files other than ingest view files, found path with "
                f"file type: {parts.file_type}")

        if parts.file_tag not in self.get_file_tag_rank_list():
            logging.info(
                "File tag [%s] for path [%s] not in rank list - not splitting.",
                parts.file_tag,
                path.abs_path(),
            )
            return False

        if (parts.is_file_split and parts.file_split_size and
                parts.file_split_size <= self.ingest_file_split_line_limit):
            logging.info(
                "File [%s] already split with size [%s].",
                path.abs_path(),
                parts.file_split_size,
            )
            return False

        return self._must_split_contents(parts.file_type, path)

    @trace.span
    def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
        """Checks if the given file needs to be split according to this controller's |file_split_line_limit|.

        Returns True if the file was split, False if splitting was not necessary.
        """

        should_split = self._should_split_file(path)
        if not should_split:
            logging.info("No need to split file path [%s].", path.abs_path())
            return False

        logging.info("Proceeding to file splitting for path [%s].",
                     path.abs_path())

        original_metadata = self.file_metadata_manager.get_ingest_view_file_metadata(
            path)

        output_dir = GcsfsDirectoryPath.from_file_path(path)

        split_contents_paths = self._split_file(path)
        upload_paths = []
        for i, split_contents_path in enumerate(split_contents_paths):
            upload_path = self._create_split_file_path(path,
                                                       output_dir,
                                                       split_num=i)

            logging.info(
                "Copying split [%s] to direct ingest directory at path [%s].",
                i,
                upload_path.abs_path(),
            )

            upload_paths.append(upload_path)
            try:
                self.fs.mv(split_contents_path, upload_path)
            except Exception as e:
                logging.error(
                    "Threw error while copying split files from temp bucket - attempting to clean up before rethrowing."
                    " [%s]",
                    e,
                )
                for p in upload_paths:
                    self.fs.delete(p)
                raise e

        # We wait to register files with metadata manager until all files have been successfully copied to avoid leaving
        # the metadata manager in an inconsistent state.
        if not isinstance(original_metadata, DirectIngestIngestFileMetadata):
            raise ValueError("Attempting to split a non-ingest view type file")

        logging.info(
            "Registering [%s] split files with the metadata manager.",
            len(upload_paths),
        )

        for upload_path in upload_paths:
            ingest_file_metadata = (
                self.file_metadata_manager.register_ingest_file_split(
                    original_metadata, upload_path))
            self.file_metadata_manager.mark_ingest_view_exported(
                ingest_file_metadata)

        self.file_metadata_manager.mark_ingest_view_file_as_processed(path)

        logging.info(
            "Done splitting file [%s] into [%s] paths, moving it to storage.",
            path.abs_path(),
            len(split_contents_paths),
        )

        self.fs.mv_path_to_storage(path, self.storage_directory_path)

        return True

    def _create_split_file_path(
        self,
        original_file_path: GcsfsFilePath,
        output_dir: GcsfsDirectoryPath,
        split_num: int,
    ) -> GcsfsFilePath:
        parts = filename_parts_from_path(original_file_path)

        rank_str = str(split_num + 1).zfill(5)
        updated_file_name = (
            f"{parts.stripped_file_name}_{rank_str}"
            f"_{SPLIT_FILE_SUFFIX}_size{self.ingest_file_split_line_limit}"
            f".{parts.extension}")

        return GcsfsFilePath.from_directory_and_file_name(
            output_dir,
            to_normalized_unprocessed_file_path(
                updated_file_name,
                file_type=parts.file_type,
                dt=parts.utc_upload_datetime,
            ),
        )

    @abc.abstractmethod
    def _split_file(self, path: GcsfsFilePath) -> List[GcsfsFilePath]:
        """Should be implemented by subclasses to split a file accessible via the provided path into multiple
        files and upload those files to GCS. Returns the list of upload paths."""

    @staticmethod
    def file_tag(file_path: GcsfsFilePath) -> str:
        return filename_parts_from_path(file_path).file_tag
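
A hedged sketch of the region-specific hooks a subclass of BaseDirectIngestController
(Example #14) would supply; the class name, region code, file tags, and timeout value
are hypothetical, and the GCS/ingest types are assumed to be importable from the same
modules used above.

from typing import List

class UsXxIngestController(BaseDirectIngestController):
    """Hypothetical region controller; only the signatures mirror the abstract
    methods declared in Example #14 above."""

    @classmethod
    def region_code(cls) -> str:
        return "us_xx"  # placeholder region code

    def get_file_tag_rank_list(self) -> List[str]:
        # Ingest views are exported and processed in this order.
        return ["person", "booking", "charge"]

    def default_job_lock_timeout_in_seconds(self) -> int:
        # Example override: allow two hours before the job lock is considered stale.
        return 2 * 3600

    def _are_contents_empty(self, args: GcsfsIngestArgs,
                            contents_handle: GcsfsFileContentsHandle) -> bool:
        # e.g. treat a CSV with only a header row as empty.
        raise NotImplementedError

    def _parse(self, args: GcsfsIngestArgs,
               contents_handle: GcsfsFileContentsHandle) -> ingest_info.IngestInfo:
        # Region-specific mapping of exported ingest view rows into IngestInfo.
        raise NotImplementedError

    def _file_meets_file_line_limit(self, line_limit: int,
                                    path: GcsfsFilePath) -> bool:
        # e.g. count the rows in the file at |path| and compare them to |line_limit|.
        raise NotImplementedError

    def _split_file(self, path: GcsfsFilePath) -> List[GcsfsFilePath]:
        # Split the file at |path| into chunks under the line limit, upload them to
        # GCS, and return the uploaded paths.
        raise NotImplementedError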