def __init__(
        self,
        region_code: str,
        dry_run: bool,
):
    self.region_code = region_code
    self.file_type = GcsfsDirectIngestFileType.UNSPECIFIED
    self.dry_run = dry_run
    self.project_id = 'recidiviz-123'
    self.region_ingest_bucket_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_directory_path_for_region(
            region_code, SystemLevel.STATE, project_id=self.project_id))
    self.region_storage_raw_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE,
            GcsfsDirectIngestFileType.RAW_DATA, project_id=self.project_id))
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f'move_prod_ingest_files_to_raw_start_bound_{self.region_code}_region_dry_run_{dry_run}_'
        f'{datetime.datetime.now().isoformat()}.txt')
    self.mutex = threading.Lock()
    self.move_list: List[Tuple[str, str]] = []
    self.move_progress: Optional[Bar] = None
def __init__(self,
             region_name: str,
             system_level: SystemLevel,
             ingest_directory_path: Optional[str] = None,
             storage_directory_path: Optional[str] = None,
             max_delay_sec_between_files: Optional[int] = None):
    super().__init__(region_name, system_level)
    self.fs = GcsfsFactory.build()
    self.max_delay_sec_between_files = max_delay_sec_between_files

    if not ingest_directory_path:
        ingest_directory_path = \
            gcsfs_direct_ingest_directory_path_for_region(
                region_name, system_level)
    self.ingest_directory_path = \
        GcsfsDirectoryPath.from_absolute_path(ingest_directory_path)

    if not storage_directory_path:
        storage_directory_path = \
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_name, system_level)
    self.storage_directory_path = \
        GcsfsDirectoryPath.from_absolute_path(storage_directory_path)

    self.file_prioritizer = \
        GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_directory_path,
            self._get_file_tag_rank_list())

    self.file_split_line_limit = self._FILE_SPLIT_LINE_LIMIT
def __init__(self,
             region_name: str,
             system_level: SystemLevel,
             ingest_directory_path: Optional[str] = None,
             storage_directory_path: Optional[str] = None,
             max_delay_sec_between_files: Optional[int] = None):
    super().__init__(region_name, system_level)
    self.fs = GcsfsFactory.build()
    self.max_delay_sec_between_files = max_delay_sec_between_files

    if not ingest_directory_path:
        ingest_directory_path = \
            gcsfs_direct_ingest_directory_path_for_region(
                region_name, system_level)
    self.ingest_directory_path = \
        GcsfsDirectoryPath.from_absolute_path(ingest_directory_path)

    if not storage_directory_path:
        storage_directory_path = \
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_name, system_level)
    self.storage_directory_path = \
        GcsfsDirectoryPath.from_absolute_path(storage_directory_path)

    self.temp_output_directory_path = \
        GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_temporary_output_directory_path())

    ingest_job_file_type_filter = \
        GcsfsDirectIngestFileType.INGEST_VIEW \
        if self.region.is_raw_vs_ingest_file_name_detection_enabled() else None
    self.file_prioritizer = \
        GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_directory_path,
            self.get_file_tag_rank_list(),
            ingest_job_file_type_filter)

    self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

    self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
        region_code=self.region.region_code)

    self.raw_file_import_manager = DirectIngestRawFileImportManager(
        region=self.region,
        fs=self.fs,
        ingest_directory_path=self.ingest_directory_path,
        temp_output_directory_path=self.temp_output_directory_path,
        big_query_client=BigQueryClientImpl())

    self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
        region=self.region,
        fs=self.fs,
        ingest_directory_path=self.ingest_directory_path,
        file_metadata_manager=self.file_metadata_manager,
        big_query_client=BigQueryClientImpl(),
        view_collector=DirectIngestPreProcessedIngestViewCollector(
            self.region, self.get_file_tag_rank_list()))
def __init__(self,
             file_type: GcsfsDirectIngestFileType,
             region_code: str,
             start_date_bound: Optional[str],
             end_date_bound: Optional[str],
             dry_run: bool,
             project_id: str,
             file_filter: Optional[str]):
    self.file_type = file_type
    self.region_code = region_code
    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound
    self.dry_run = dry_run
    self.file_filter = file_filter
    self.project_id = project_id
    self.region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE, self.file_type,
            project_id=self.project_id))
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f'move_storage_files_to_deprecated_start_bound_{self.region_code}_region_{self.start_date_bound}'
        f'_end_bound_{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt'
    )
    self.mutex = threading.Lock()
    self.move_list: List[Tuple[str, str]] = []
    self.move_progress: Optional[Bar] = None
def _split_file(self,
                path: GcsfsFilePath,
                file_contents_handle: GcsfsFileContentsHandle) -> None:
    output_dir = GcsfsDirectoryPath.from_file_path(path)

    upload_paths_and_df = []
    for i, df in enumerate(
            pd.read_csv(file_contents_handle.local_file_path,
                        dtype=str,
                        chunksize=self.file_split_line_limit,
                        keep_default_na=False)):
        upload_path = self._create_split_file_path(
            path, output_dir, split_num=i)
        upload_paths_and_df.append((upload_path, df))

    for output_path, df in upload_paths_and_df:
        logging.info("Writing file split [%s] to Cloud Storage.",
                     output_path.abs_path())
        self.fs.upload_from_string(
            output_path, df.to_csv(index=False), 'text/csv')

    logging.info("Done splitting file [%s] into [%s] paths, returning.",
                 path.abs_path(), len(upload_paths_and_df))

    self.fs.mv_path_to_storage(path, self.storage_directory_path)
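# For reference, a minimal standalone sketch (not from the source; the CSV
# content is made up) of the pandas chunking behavior _split_file relies on:
# chunksize yields DataFrames of at most that many rows, while dtype=str and
# keep_default_na=False preserve raw string values instead of parsing NaNs.
import io

import pandas as pd

csv_contents = 'col1,col2\n1,2\n3,4\n5,6\n'
for i, chunk_df in enumerate(
        pd.read_csv(io.StringIO(csv_contents), dtype=str, chunksize=2,
                    keep_default_na=False)):
    # With chunksize=2 over 3 data rows, this prints splits of 2 rows, then 1.
    print(i, len(chunk_df))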
def _move_files(self, from_uri: str):
    curr_gcsfs_file_path = GcsfsFilePath.from_absolute_path(from_uri)
    previous_date_format = filename_parts_from_path(
        curr_gcsfs_file_path).date_str
    new_date_format = date.fromisoformat(previous_date_format).strftime(
        "%Y/%m/%d/")

    path_with_new_file_name = GcsfsFilePath.from_absolute_path(
        to_normalized_unprocessed_file_path_from_normalized_path(
            from_uri, GcsfsDirectIngestFileType.RAW_DATA))
    if DirectIngestGCSFileSystem.is_processed_file(curr_gcsfs_file_path):
        path_with_new_file_name = GcsfsFilePath.from_absolute_path(
            to_normalized_processed_file_path_from_normalized_path(
                from_uri, GcsfsDirectIngestFileType.RAW_DATA))

    raw_dir_with_date = GcsfsDirectoryPath.from_dir_and_subdir(
        self.region_storage_raw_dir_path, new_date_format)

    to_uri = GcsfsFilePath.from_directory_and_file_name(
        raw_dir_with_date, path_with_new_file_name.file_name).uri()

    if not self.dry_run:
        gsutil_mv(from_path=from_uri, to_path=to_uri)
    with self.mutex:
        self.move_list.append((from_uri, to_uri))
        if self.move_progress:
            self.move_progress.next()
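# A minimal standalone sketch (example values, not from the source) of the
# date re-formatting above: the ISO date string parsed from the normalized
# file name becomes a nested year/month/day storage prefix.
import datetime

previous_date_format = '2019-01-02'  # example date_str from a file name
new_date_format = datetime.date.fromisoformat(previous_date_format).strftime(
    '%Y/%m/%d/')
print(new_date_format)  # -> 2019/01/02/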
def create_export_manager(self, region):
    metadata_manager = PostgresDirectIngestFileMetadataManager(
        region.region_code)
    return DirectIngestIngestViewExportManager(
        region=region,
        fs=FakeDirectIngestGCSFileSystem(),
        ingest_directory_path=GcsfsDirectoryPath.from_absolute_path(
            'ingest_bucket'),
        big_query_client=self.mock_client,
        file_metadata_manager=metadata_manager,
        view_collector=_ViewCollector(
            region, controller_file_tags=['ingest_view']))
def gcs_export_directory(bucket_name: str,
                         today: datetime.date,
                         state_code: str) -> GcsfsDirectoryPath:
    """Returns a GCS directory to export files into, of the format:
    gs://{bucket_name}/ingested_state_data/{state_code}/{YYYY}/{MM}/{DD}
    """
    path = GcsfsDirectoryPath.from_bucket_and_blob_name(
        bucket_name=bucket_name,
        blob_name=f'ingested_state_data/{state_code}/'
                  f'{today.year:04}/{today.month:02}/{today.day:02}/')
    return cast(GcsfsDirectoryPath, path)
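# A quick standalone sketch (the state code is made up) of the blob name
# gcs_export_directory builds, showing the zero-padded date components:
import datetime

today = datetime.date(2020, 3, 7)
blob_name = (f'ingested_state_data/US_XX/'
             f'{today.year:04}/{today.month:02}/{today.day:02}/')
print(blob_name)  # -> ingested_state_data/US_XX/2020/03/07/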
def _copy_files_for_date(self, subdir_path_str: str):
    dir_path = GcsfsDirectoryPath.from_absolute_path(
        subdir_path_str.rstrip('/'))
    from_path = f'gs://{self.prod_region_storage_dir_path.bucket_name}/{dir_path.relative_path}*'
    to_path = f'gs://{self.staging_region_storage_dir_path.bucket_name}/{dir_path.relative_path}'
    if not self.dry_run:
        gsutil_cp(from_path=from_path, to_path=to_path)
    with self.mutex:
        self.copy_list.append((from_path, to_path))
        if self.copy_progress:
            self.copy_progress.next()
def __init__(self,
             project_id: str,
             region: str,
             file_type_to_move: GcsfsDirectIngestFileType,
             destination_file_type: GcsfsDirectIngestFileType,
             start_date_bound: Optional[str],
             end_date_bound: Optional[str],
             dry_run: bool,
             file_filter: Optional[str]):
    self.project_id = project_id
    self.region = region
    self.file_type_to_move = file_type_to_move
    self.destination_file_type = destination_file_type

    if self.file_type_to_move != self.destination_file_type and \
            self.file_type_to_move != GcsfsDirectIngestFileType.UNSPECIFIED:
        raise ValueError(
            'Args file_type_to_move and destination_file_type must match '
            'unless the type to move is UNSPECIFIED')

    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound
    self.dry_run = dry_run
    self.file_filter = file_filter

    self.storage_bucket = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id))
    self.ingest_bucket = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_directory_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id))

    self.mutex = threading.Lock()
    self.collect_progress: Optional[Bar] = None
    self.move_progress: Optional[Bar] = None
    self.moves_list: List[Tuple[str, str]] = []
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f'move_result_{region}_{self.project_id}_start_bound_{self.start_date_bound}_end_bound_'
        f'{self.end_date_bound}_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt'
    )
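# A minimal sketch of the move-type rule enforced above (FileType is a
# stand-in for GcsfsDirectIngestFileType): a move is valid when the source
# and destination types match, or when the type to move is UNSPECIFIED and
# is being re-labeled as a specific type.
import enum


class FileType(enum.Enum):
    UNSPECIFIED = 'unspecified'
    RAW_DATA = 'raw'
    INGEST_VIEW = 'ingest_view'


def is_valid_move(file_type_to_move: FileType,
                  destination_file_type: FileType) -> bool:
    return (file_type_to_move == destination_file_type
            or file_type_to_move == FileType.UNSPECIFIED)


assert is_valid_move(FileType.UNSPECIFIED, FileType.RAW_DATA)
assert is_valid_move(FileType.RAW_DATA, FileType.RAW_DATA)
assert not is_valid_move(FileType.INGEST_VIEW, FileType.RAW_DATA)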
def __init__(self,
             region_code: str,
             file_type: GcsfsDirectIngestFileType,
             start_date_bound: Optional[str],
             end_date_bound: Optional[str],
             dry_run: bool):
    self.file_type = file_type
    self.prod_region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE, project_id='recidiviz-123'))
    self.staging_region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE, project_id='recidiviz-staging'))
    self.dry_run = dry_run
    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f'copy_prod_to_staging_result_{region_code}_start_bound_{self.start_date_bound}_end_bound_'
        f'{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt'
    )
    self.mutex = threading.Lock()
    self.copy_list: List[Tuple[str, str]] = []
    self.copy_progress: Optional[Bar] = None
def __init__(self,
             paths: str,
             project_id: str,
             region: str,
             date: str,
             dry_run: bool):
    self.paths = paths
    self.project_id = project_id
    self.region = region.lower()
    self.datetime = datetime.datetime.fromisoformat(date)
    self.dry_run = dry_run
    self.ingest_bucket = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_directory_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id))
    self.mutex = threading.Lock()
    self.move_progress: Optional[Bar] = None
    self.copies_list: List[Tuple[str, str]] = []
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f'upload_to_ingest_result_{region}_{self.project_id}_date_{self.datetime.date().isoformat()}'
        f'_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt'
    )
def _move_files_for_date(self, subdir_path_str: str):
    """Moves all files in the given dated subdirectory from their original
    paths to new, re-dated storage paths."""
    from_dir_path = GcsfsDirectoryPath.from_absolute_path(
        subdir_path_str.rstrip('/'))
    previous_date_format = from_dir_path.relative_path.rstrip('/').split(
        '/')[-1]
    new_date_format = date.fromisoformat(previous_date_format).strftime(
        "%Y/%m/%d/")

    from_paths = gsutil_ls(f'{subdir_path_str}*.csv')
    for from_path in from_paths:
        file_name = GcsfsFilePath(
            bucket_name=self.region_storage_dir_path.bucket_name,
            blob_name=from_path).file_name
        to_file_path = os.path.join(
            'gs://', self.region_storage_dir_path.bucket_name,
            self.region_code, GcsfsDirectIngestFileType.RAW_DATA.value,
            new_date_format, file_name)
        to_path = to_normalized_processed_file_path_from_normalized_path(
            to_file_path,
            file_type_override=GcsfsDirectIngestFileType.RAW_DATA)

        if not self.dry_run:
            gsutil_mv(from_path=from_path, to_path=to_path)
        with self.mutex:
            self.move_list.append((from_path, to_path))

    if self.move_progress:
        self.move_progress.next()
class DirectIngestRawFileImportManagerTest(unittest.TestCase):
    """Tests for DirectIngestRawFileImportManager."""

    def setUp(self) -> None:
        self.project_id = 'recidiviz-456'
        self.test_region = fake_region(
            region_code='us_xx', are_raw_data_bq_imports_enabled_in_env=True)

        self.fs = FakeDirectIngestGCSFileSystem()
        self.ingest_directory_path = GcsfsDirectoryPath(
            bucket_name='direct/controllers/fixtures')
        self.temp_output_path = GcsfsDirectoryPath(bucket_name='temp_bucket')

        self.region_raw_file_config = DirectIngestRegionRawFileConfig(
            region_code='us_xx',
            yaml_config_file_path=fixtures.as_filepath(
                'us_xx_raw_data_files.yaml'),
        )

        self.mock_big_query_client = create_autospec(BigQueryClient)
        self.num_lines_uploaded = 0

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.side_effect = \
            self.mock_import_raw_file_to_big_query

        self.import_manager = DirectIngestRawFileImportManager(
            region=self.test_region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_path,
            region_raw_file_config=self.region_raw_file_config,
            big_query_client=self.mock_big_query_client)
        self.import_manager.csv_reader = TestSafeGcsCsvReader(self.fs)

        self.time_patcher = patch(
            'recidiviz.ingest.direct.controllers.direct_ingest_raw_file_import_manager.time'
        )
        self.mock_time = self.time_patcher.start()

        def fake_get_dataset_ref(
                dataset_id: str) -> bigquery.DatasetReference:
            return bigquery.DatasetReference(project=self.project_id,
                                             dataset_id=dataset_id)

        self.mock_big_query_client.dataset_ref_for_id = fake_get_dataset_ref

    def tearDown(self) -> None:
        self.time_patcher.stop()

    def mock_import_raw_file_to_big_query(
            self,
            *,
            source_uri: str,
            destination_table_schema: List[bigquery.SchemaField],
            **_kwargs):
        col_names = [
            schema_field.name for schema_field in destination_table_schema
        ]

        temp_path = GcsfsFilePath.from_absolute_path(source_uri)
        local_temp_path = self.fs.uploaded_test_path_to_actual[
            temp_path.abs_path()]

        df = pd.read_csv(local_temp_path, header=None, dtype=str)
        for value in df.values:
            for cell in value:
                if isinstance(cell, str):
                    stripped_cell = cell.strip()
                    if stripped_cell != cell:
                        raise ValueError(
                            'Did not strip white space from raw data cell')

                if cell in col_names:
                    raise ValueError(
                        f'Wrote column row to output file: {value}')
        self.num_lines_uploaded += len(df)

        return mock.MagicMock()

    def _metadata_for_unprocessed_file_path(
            self, path: GcsfsFilePath) -> DirectIngestFileMetadata:
        parts = filename_parts_from_path(path)
        return DirectIngestFileMetadata(
            region_code=self.test_region.region_code,
            file_tag=parts.file_tag,
            file_id=123,
            processed_time=None)

    def _check_no_temp_files_remain(self):
        for path in self.fs.all_paths:
            if path.abs_path().startswith(self.temp_output_path.abs_path()):
                self.fail(
                    f'Expected temp path {path.abs_path()} to be cleaned up')

    def test_get_unprocessed_raw_files_to_import(self):
        self.assertEqual(
            [], self.import_manager.get_unprocessed_raw_files_to_import())

        raw_unprocessed = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='file_tag_first.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        ingest_view_unprocessed = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='file_tag_second.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.INGEST_VIEW)

        self.fs.test_add_path(raw_unprocessed)
        self.fs.test_add_path(ingest_view_unprocessed)

        self.assertEqual(
            [raw_unprocessed],
            self.import_manager.get_unprocessed_raw_files_to_import())

    def test_import_bq_file_not_in_tags(self):
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='this_path_tag_not_in_yaml.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestFileMetadata))

    def test_import_bq_file_with_ingest_view_file(self):
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='file_tag_first.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.INGEST_VIEW)

        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestFileMetadata))

    def test_import_bq_file_with_unspecified_type_file(self):
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='file_tag_first.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.UNSPECIFIED)

        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestFileMetadata))

    def test_import_bq_file_feature_not_released_throws(self):
        self.import_manager = DirectIngestRawFileImportManager(
            region=fake_region(region_code='us_xx',
                               are_raw_data_bq_imports_enabled_in_env=False),
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_path,
            region_raw_file_config=self.region_raw_file_config,
            big_query_client=self.mock_big_query_client)

        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='file_tag_first.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        with self.assertRaises(ValueError):
            self.import_manager.import_raw_file_to_big_query(
                file_path, create_autospec(DirectIngestFileMetadata))

    def test_import_bq_file_with_raw_file(self):
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='tagC.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        self.fs.test_add_path(file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.uploaded_test_path_to_actual))
        path = one(self.fs.uploaded_test_path_to_actual.keys())

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=f'gs://{path}',
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, 'us_xx_raw_data'),
            destination_table_id='tagC',
            destination_table_schema=[
                bigquery.SchemaField('COL1', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('COL2', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('COL3', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'),
                bigquery.SchemaField('update_datetime', 'DATETIME',
                                     'REQUIRED')
            ])
        self.assertEqual(2, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_with_raw_file_alternate_separator_and_encoding(
            self):
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='tagPipeSeparatedNonUTF8.txt',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        self.fs.test_add_path(file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(1, len(self.fs.uploaded_test_path_to_actual))
        path = one(self.fs.uploaded_test_path_to_actual.keys())

        self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
            source_uri=f'gs://{path}',
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, 'us_xx_raw_data'),
            destination_table_id='tagPipeSeparatedNonUTF8',
            destination_table_schema=[
                bigquery.SchemaField('PRIMARY_COL1', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('COL2', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('COL3', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('COL4', 'STRING', 'NULLABLE'),
                bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'),
                bigquery.SchemaField('update_datetime', 'DATETIME',
                                     'REQUIRED')
            ])
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_multiple_chunks_even_division(self):
        self.import_manager.upload_chunk_size = 1

        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='tagPipeSeparatedNonUTF8.txt',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        self.fs.test_add_path(file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(5, len(self.fs.uploaded_test_path_to_actual))

        expected_insert_calls = [
            call.insert_into_table_from_cloud_storage_async(
                source_uri=f'gs://{uploaded_path}',
                destination_dataset_ref=bigquery.DatasetReference(
                    self.project_id, 'us_xx_raw_data'),
                destination_table_id='tagPipeSeparatedNonUTF8',
                destination_table_schema=[
                    bigquery.SchemaField('PRIMARY_COL1', 'STRING',
                                         'NULLABLE'),
                    bigquery.SchemaField('COL2', 'STRING', 'NULLABLE'),
                    bigquery.SchemaField('COL3', 'STRING', 'NULLABLE'),
                    bigquery.SchemaField('COL4', 'STRING', 'NULLABLE'),
                    bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'),
                    bigquery.SchemaField('update_datetime', 'DATETIME',
                                         'REQUIRED')
                ]) for uploaded_path in self.fs.uploaded_test_path_to_actual
        ]

        self.assertEqual(expected_insert_calls,
                         self.mock_big_query_client.method_calls)
        self.assertEqual(
            len(expected_insert_calls) - 1, self.mock_time.sleep.call_count)
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()

    def test_import_bq_file_multiple_chunks_uneven_division(self):
        self.import_manager.upload_chunk_size = 2

        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='tagPipeSeparatedNonUTF8.txt',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)

        self.fs.test_add_path(file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

        self.assertEqual(3, len(self.fs.uploaded_test_path_to_actual))

        expected_insert_calls = [
            call.insert_into_table_from_cloud_storage_async(
                source_uri=f'gs://{uploaded_path}',
                destination_dataset_ref=bigquery.DatasetReference(
                    self.project_id, 'us_xx_raw_data'),
                destination_table_id='tagPipeSeparatedNonUTF8',
                destination_table_schema=[
                    bigquery.SchemaField('PRIMARY_COL1', 'STRING',
                                         'NULLABLE'),
                    bigquery.SchemaField('COL2', 'STRING', 'NULLABLE'),
                    bigquery.SchemaField('COL3', 'STRING', 'NULLABLE'),
                    bigquery.SchemaField('COL4', 'STRING', 'NULLABLE'),
                    bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'),
                    bigquery.SchemaField('update_datetime', 'DATETIME',
                                         'REQUIRED')
                ]) for uploaded_path in self.fs.uploaded_test_path_to_actual
        ]

        self.assertEqual(expected_insert_calls,
                         self.mock_big_query_client.method_calls)
        self.assertEqual(
            len(expected_insert_calls) - 1, self.mock_time.sleep.call_count)
        self.assertEqual(5, self.num_lines_uploaded)
        self._check_no_temp_files_remain()
def test_processing_continues_if_there_are_subfolders_in_ingest_dir(self):
    controller = build_gcsfs_controller_for_tests(
        StateTestGcsfsDirectIngestController,
        self.FIXTURE_PATH_PREFIX,
        run_async=False)

    if not isinstance(controller.fs, FakeDirectIngestGCSFileSystem):
        raise ValueError(f"Controller fs must have type "
                         f"FakeDirectIngestGCSFileSystem. Found instead "
                         f"type [{type(controller.fs)}]")

    subdir_path = path_for_fixture_file(
        controller, 'subdir/', should_normalize=False)
    paths = [
        subdir_path,
        path_for_fixture_file(
            controller, 'subdir/Unexpected_Tag.csv', should_normalize=False),
        path_for_fixture_file(
            controller, 'tagA.csv', should_normalize=False),
        path_for_fixture_file(
            controller, 'tagB.csv', should_normalize=False),
        path_for_fixture_file(
            controller, 'tagC.csv', should_normalize=False),
        path_for_fixture_file(
            controller, 'subdir/tagC_2.csv', should_normalize=False),
    ]

    for path in paths:
        controller.fs.test_add_path(path)

    run_task_queues_to_empty(controller)

    dir_paths_found = []
    storage_file_paths = []
    ingest_file_paths = []

    for path in controller.fs.all_paths:
        if isinstance(path, GcsfsDirectoryPath):
            dir_paths_found.append(path)
            continue
        if path.abs_path().startswith(
                controller.storage_directory_path.abs_path()):
            storage_file_paths.append(path)
        else:
            self.assertTrue(path.abs_path().startswith(
                controller.ingest_directory_path.abs_path()))
            ingest_file_paths.append(path)

    self.assertEqual(1, len(dir_paths_found))
    self.assertEqual(subdir_path, dir_paths_found[0])

    self.assertEqual(3, len(storage_file_paths))
    storage_tags = {
        filename_parts_from_path(path).file_tag
        for path in storage_file_paths
    }
    self.assertEqual({'tagA', 'tagB', 'tagC'}, storage_tags)

    for path in storage_file_paths:
        self.assertTrue(controller.fs.is_normalized_file_path(path))
        self.assertTrue(controller.fs.is_processed_file(path))

    self.assertEqual(2, len(ingest_file_paths))
    ingest_tags = {
        filename_parts_from_path(path).file_tag
        for path in ingest_file_paths
    }
    self.assertEqual({'tagC', 'Unexpected_Tag'}, ingest_tags)

    for path in ingest_file_paths:
        self.assertTrue(controller.fs.is_normalized_file_path(path))
        self.assertTrue(controller.fs.is_seen_unprocessed_file(path))
        self.assertEqual(subdir_path,
                         GcsfsDirectoryPath.from_file_path(path))
def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
    """Checks if the given file needs to be split according to this
    controller's |file_split_line_limit|.

    Returns True if the file was split, False if splitting was not necessary.
    """
    should_split = self._should_split_file(path)
    if not should_split:
        logging.info("No need to split file path [%s].", path.abs_path())
        return False

    logging.info("Proceeding to file splitting for path [%s].",
                 path.abs_path())

    original_metadata = None
    if self.region.are_ingest_view_exports_enabled_in_env():
        original_metadata = self.file_metadata_manager.get_file_metadata(
            path)

    output_dir = GcsfsDirectoryPath.from_file_path(path)

    split_contents_paths = self._split_file(path)
    for i, split_contents_path in enumerate(split_contents_paths):
        upload_path = self._create_split_file_path(
            path, output_dir, split_num=i)

        ingest_file_metadata = None
        if self.region.are_ingest_view_exports_enabled_in_env():
            if not isinstance(original_metadata,
                              DirectIngestIngestFileMetadata):
                raise ValueError(
                    'Attempting to split a non-ingest view type file')
            ingest_file_metadata = \
                self.file_metadata_manager.register_ingest_file_split(
                    original_metadata, upload_path)

        logging.info(
            "Copying split [%s] to direct ingest directory at path [%s].",
            i, upload_path.abs_path())
        self.fs.mv(split_contents_path, upload_path)

        if self.region.are_ingest_view_exports_enabled_in_env():
            if not ingest_file_metadata:
                raise ValueError(
                    f'Split file metadata for path unexpectedly none '
                    f'[{upload_path.abs_path()}]')
            self.file_metadata_manager.mark_ingest_view_exported(
                ingest_file_metadata)

    if self.region.are_ingest_view_exports_enabled_in_env():
        self.file_metadata_manager.mark_file_as_processed(path)

    logging.info(
        "Done splitting file [%s] into [%s] paths, moving it to storage.",
        path.abs_path(), len(split_contents_paths))

    self.fs.mv_path_to_storage(path, self.storage_directory_path)
    return True
def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
    """Checks if the given file needs to be split according to this
    controller's |file_split_line_limit|.

    Returns True if the file was split, False if splitting was not necessary.
    """
    should_split = self._should_split_file(path)
    if not should_split:
        logging.info("No need to split file path [%s].", path.abs_path())
        return False

    logging.info("Proceeding to file splitting for path [%s].",
                 path.abs_path())

    original_metadata = None
    if self.region.are_ingest_view_exports_enabled_in_env():
        original_metadata = self.file_metadata_manager.get_file_metadata(
            path)

    output_dir = GcsfsDirectoryPath.from_file_path(path)

    split_contents_paths = self._split_file(path)
    upload_paths = []
    for i, split_contents_path in enumerate(split_contents_paths):
        upload_path = self._create_split_file_path(
            path, output_dir, split_num=i)

        logging.info(
            "Copying split [%s] to direct ingest directory at path [%s].",
            i, upload_path.abs_path())

        upload_paths.append(upload_path)
        try:
            self.fs.mv(split_contents_path, upload_path)
        except Exception as e:
            logging.error(
                'Threw error while copying split files from temp bucket - '
                'attempting to clean up before rethrowing. [%s]', e)
            for p in upload_paths:
                self.fs.delete(p)
            raise e

    # We wait to register files with the metadata manager until all files
    # have been successfully copied to avoid leaving the metadata manager in
    # an inconsistent state.
    if self.region.are_ingest_view_exports_enabled_in_env():
        if not isinstance(original_metadata,
                          DirectIngestIngestFileMetadata):
            raise ValueError(
                'Attempting to split a non-ingest view type file')

        logging.info(
            'Registering [%s] split files with the metadata manager.',
            len(upload_paths))

        for upload_path in upload_paths:
            ingest_file_metadata = \
                self.file_metadata_manager.register_ingest_file_split(
                    original_metadata, upload_path)
            self.file_metadata_manager.mark_ingest_view_exported(
                ingest_file_metadata)

        self.file_metadata_manager.mark_file_as_processed(path)

    logging.info(
        "Done splitting file [%s] into [%s] paths, moving it to storage.",
        path.abs_path(), len(split_contents_paths))

    self.fs.mv_path_to_storage(path, self.storage_directory_path)
    return True
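# A minimal sketch (hypothetical `fs` object with mv/delete methods, not the
# source's API) of the copy-then-register pattern used above: copy every
# split first, deleting already-copied files on failure, so metadata is only
# registered once all copies have succeeded.
def copy_all_or_none(fs, srcs_to_dsts):
    copied = []
    try:
        for src, dst in srcs_to_dsts:
            fs.mv(src, dst)
            copied.append(dst)
    except Exception:
        # Roll back partial progress before re-raising so downstream
        # bookkeeping never sees a half-copied set of splits.
        for dst in copied:
            fs.delete(dst)
        raise
    return copied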
class TestGcsfsDirectIngestJobPrioritizer(unittest.TestCase):
    """Tests for the GcsfsDirectIngestJobPrioritizer."""

    _DAY_1_TIME_1 = datetime.datetime(
        year=2019, month=1, day=2, hour=3, minute=4, second=5,
        microsecond=6789, tzinfo=datetime.timezone.utc)

    _DAY_1_TIME_2 = datetime.datetime(
        year=2019, month=1, day=2, hour=3, minute=4, second=5,
        microsecond=7789, tzinfo=datetime.timezone.utc)

    _DAY_1_TIME_3 = datetime.datetime(
        year=2019, month=1, day=2, hour=10, minute=4, second=5,
        microsecond=678, tzinfo=datetime.timezone.utc)

    _DAY_2_TIME_1 = datetime.datetime(
        year=2019, month=1, day=3, hour=3, minute=4, second=5,
        microsecond=6789, tzinfo=datetime.timezone.utc)

    _DAY_1 = _DAY_1_TIME_1.date()
    _DAY_2 = _DAY_2_TIME_1.date()

    _INGEST_BUCKET_PATH = \
        GcsfsDirectoryPath.from_absolute_path('direct/regions/us_nd/fixtures')

    FIXTURE_PATH_PREFIX = 'direct/regions/us_nd/fixtures'

    def setUp(self) -> None:
        self.fs = FakeDirectIngestGCSFileSystem()
        self.prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs, self._INGEST_BUCKET_PATH, ['tagA', 'tagB'])

    def _normalized_path_for_filename(
            self, filename: str, dt: datetime.datetime) -> GcsfsFilePath:
        normalized_path = \
            to_normalized_unprocessed_file_path(
                os.path.join(self._INGEST_BUCKET_PATH.abs_path(), filename),
                dt)
        return GcsfsFilePath.from_absolute_path(normalized_path)

    def _process_jobs_for_paths_with_no_gaps_in_expected_order(
            self, paths: List[GcsfsFilePath]):
        for path in paths:
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            self.assertIsNotNone(next_job_args)
            if next_job_args is None:
                # Make mypy happy
                self.fail()
            self.assertEqual(next_job_args.file_path, path)
            self.assertTrue(
                self.prioritizer.are_next_args_expected(next_job_args))

            self.assertTrue(
                self.prioritizer.are_more_jobs_expected_for_day(date_str))

            # ... job runs ...

            self.fs.mv_path_to_processed_path(path)

    def test_empty_fs(self):
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1_TIME_1.date().isoformat()))
        self.assertIsNone(self.prioritizer.get_next_job_args())

    def test_single_expected_file(self):
        path = self._normalized_path_for_filename(
            'tagA.csv', self._DAY_1_TIME_1)

        self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order([path])

        self.assertIsNone(self.prioritizer.get_next_job_args())

        # We still expect a file for tagB
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_multiple_files(self):
        paths = [
            self._normalized_path_for_filename('tagA.csv',
                                               self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv',
                                               self._DAY_1_TIME_2)
        ]

        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_unexpected_file(self):
        # Only file is out of order
        path = self._normalized_path_for_filename(
            'tagB.csv', self._DAY_1_TIME_1)

        self.fs.test_add_path(path)

        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

        next_job_args = self.prioritizer.get_next_job_args()
        self.assertIsNotNone(next_job_args)
        self.assertEqual(next_job_args.file_path, path)
        self.assertFalse(
            self.prioritizer.are_next_args_expected(next_job_args))

        # ... job runs eventually even though unexpected...

        self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())

        # We still expect a file for tagA
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_files_on_multiple_days(self):
        paths = [
            self._normalized_path_for_filename('tagA.csv',
                                               self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv',
                                               self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagA.csv',
                                               self._DAY_2_TIME_1),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_2.isoformat()))

    def test_files_on_multiple_days_with_gap(self):
        """Runs a test where there are files on multiple days and there is a
        gap in the expected files for the first day.
        """
        paths = [
            self._normalized_path_for_filename('tagB.csv',
                                               self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagA.csv',
                                               self._DAY_2_TIME_1),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        for i, path in enumerate(paths):
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            self.assertIsNotNone(next_job_args)
            self.assertEqual(next_job_args.file_path, path)

            are_args_expected = \
                self.prioritizer.are_next_args_expected(next_job_args)
            if i == 0:
                self.assertFalse(are_args_expected)
            else:
                self.assertTrue(are_args_expected)

            self.assertTrue(
                self.prioritizer.are_more_jobs_expected_for_day(date_str))

            # ... job runs ...

            self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_2.isoformat()))

    def test_multiple_files_same_tag(self):
        paths = [
            self._normalized_path_for_filename('tagA.csv',
                                               self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagA.csv',
                                               self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagB.csv',
                                               self._DAY_1_TIME_3),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_multiple_files_times_out_of_order(self):
        """Runs a test where there are no gaps but the files have been added
        (i.e. have creation times) out of order.
        """
        paths = [
            self._normalized_path_for_filename('tagA.csv',
                                               self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagB.csv',
                                               self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv',
                                               self._DAY_1_TIME_3),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        for i, path in enumerate(paths):
            date_str = filename_parts_from_path(path).date_str
            next_job_args = self.prioritizer.get_next_job_args()
            self.assertIsNotNone(next_job_args)
            self.assertEqual(next_job_args.file_path, path)
            self.assertTrue(
                self.prioritizer.are_next_args_expected(next_job_args))

            are_more_jobs_expected = \
                self.prioritizer.are_more_jobs_expected_for_day(date_str)
            if i == 2:
                self.assertFalse(are_more_jobs_expected)
            else:
                self.assertTrue(are_more_jobs_expected)

            # ... job runs ...

            self.fs.mv_path_to_processed_path(path)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))

    def test_run_multiple_copies_of_same_tag(self):
        paths = [
            self._normalized_path_for_filename('tagA.csv',
                                               self._DAY_1_TIME_2),
            self._normalized_path_for_filename('tagA_2.csv',
                                               self._DAY_1_TIME_1),
            self._normalized_path_for_filename('tagB.csv',
                                               self._DAY_1_TIME_3),
        ]
        for path in paths:
            self.fs.test_add_path(path)

        self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

        self.assertIsNone(self.prioritizer.get_next_job_args())
        self.assertFalse(
            self.prioritizer.are_more_jobs_expected_for_day(
                self._DAY_1.isoformat()))
class TestFakeDirectIngestGcsFileSystem(TestCase):
    """Tests for the DirectIngestGCSFileSystem."""

    STORAGE_DIR_PATH = GcsfsDirectoryPath(bucket_name='storage_bucket',
                                          relative_path='region_subdir')
    INGEST_DIR_PATH = GcsfsDirectoryPath(bucket_name='my_bucket')

    def setUp(self) -> None:
        self.fs = FakeDirectIngestGCSFileSystem()

    def fully_process_file(self,
                           dt: datetime.datetime,
                           path: GcsfsFilePath,
                           file_type_differentiation_on: bool = False):
        """Mimics all the file system calls for a single file in the direct
        ingest system, from getting added to the ingest bucket, turning to a
        processed file, then getting moved to storage."""
        self.fs.test_add_path(path)

        start_num_total_files = len(self.fs.all_paths)
        # pylint: disable=protected-access
        start_ingest_paths = self.fs._ls_with_file_prefix(
            self.INGEST_DIR_PATH, '', None)
        start_storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '', None)
        if file_type_differentiation_on:
            start_raw_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH, '',
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA)
            start_ingest_view_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH, '',
                file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW)
        else:
            start_raw_storage_paths = []
            start_ingest_view_storage_paths = []

        # File is renamed to normalized path
        file_type = GcsfsDirectIngestFileType.RAW_DATA \
            if file_type_differentiation_on \
            else GcsfsDirectIngestFileType.UNSPECIFIED

        self.fs.mv_path_to_normalized_path(path, file_type, dt)

        if file_type_differentiation_on:
            raw_unprocessed = self.fs.get_unprocessed_file_paths(
                self.INGEST_DIR_PATH,
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA)
            self.assertEqual(len(raw_unprocessed), 1)
            self.assertTrue(
                self.fs.is_seen_unprocessed_file(raw_unprocessed[0]))

            # ... raw file imported to BQ

            processed_path = self.fs.mv_path_to_processed_path(
                raw_unprocessed[0])

            processed = self.fs.get_processed_file_paths(
                self.INGEST_DIR_PATH, None)
            self.assertEqual(len(processed), 1)

            self.fs.copy(
                processed_path,
                GcsfsFilePath.from_absolute_path(
                    to_normalized_unprocessed_file_path_from_normalized_path(
                        processed_path.abs_path(),
                        file_type_override=GcsfsDirectIngestFileType.
                        INGEST_VIEW)))
            self.fs.mv_path_to_storage(processed_path,
                                       self.STORAGE_DIR_PATH)

        ingest_unprocessed_filter = GcsfsDirectIngestFileType.INGEST_VIEW \
            if file_type_differentiation_on else None

        ingest_unprocessed = self.fs.get_unprocessed_file_paths(
            self.INGEST_DIR_PATH,
            file_type_filter=ingest_unprocessed_filter)
        self.assertEqual(len(ingest_unprocessed), 1)
        self.assertTrue(self.fs.is_seen_unprocessed_file(
            ingest_unprocessed[0]))

        # ... file is ingested

        # File is moved to processed path
        self.fs.mv_path_to_processed_path(ingest_unprocessed[0])
        processed = self.fs.get_processed_file_paths(
            self.INGEST_DIR_PATH, None)
        self.assertEqual(len(processed), 1)
        self.assertTrue(self.fs.is_processed_file(processed[0]))

        unprocessed = self.fs.get_unprocessed_file_paths(
            self.INGEST_DIR_PATH, None)
        self.assertEqual(len(unprocessed), 0)

        # File is moved to storage
        ingest_move_type_filter = GcsfsDirectIngestFileType.INGEST_VIEW \
            if file_type_differentiation_on else None
        self.fs.mv_processed_paths_before_date_to_storage(
            self.INGEST_DIR_PATH,
            self.STORAGE_DIR_PATH,
            date_str_bound=dt.date().isoformat(),
            include_bound=True,
            file_type_filter=ingest_move_type_filter)

        end_ingest_paths = self.fs._ls_with_file_prefix(
            self.INGEST_DIR_PATH, '', file_type_filter=None)
        end_storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '', file_type_filter=None)
        if file_type_differentiation_on:
            end_raw_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH, '',
                file_type_filter=GcsfsDirectIngestFileType.RAW_DATA)
            end_ingest_view_storage_paths = self.fs._ls_with_file_prefix(
                self.STORAGE_DIR_PATH, '',
                file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW)
        else:
            end_raw_storage_paths = []
            end_ingest_view_storage_paths = []

        # Each file gets re-exported as ingest view
        splitting_factor = 2 if file_type_differentiation_on else 1

        expected_final_total_files = \
            start_num_total_files + splitting_factor - 1
        self.assertEqual(len(self.fs.all_paths),
                         expected_final_total_files)
        self.assertEqual(len(end_ingest_paths),
                         len(start_ingest_paths) - 1)
        self.assertEqual(len(end_storage_paths),
                         len(start_storage_paths) + 1 * splitting_factor)
        if file_type_differentiation_on:
            self.assertEqual(
                len(end_raw_storage_paths) +
                len(end_ingest_view_storage_paths),
                len(end_storage_paths))
            self.assertEqual(len(end_raw_storage_paths),
                             len(start_raw_storage_paths) + 1)
            self.assertEqual(len(end_ingest_view_storage_paths),
                             len(start_ingest_view_storage_paths) + 1)

        for sp in end_storage_paths:
            parts = filename_parts_from_path(sp)
            if sp.abs_path() not in {
                    p.abs_path() for p in start_storage_paths}:
                self.assertTrue(sp.abs_path().startswith(
                    self.STORAGE_DIR_PATH.abs_path()))
                dir_path, storage_file_name = os.path.split(sp.abs_path())
                if parts.file_type != GcsfsDirectIngestFileType.UNSPECIFIED:
                    self.assertTrue(parts.file_type.value in dir_path)
                name, _ = path.file_name.split('.')
                self.assertTrue(name in storage_file_name)

    def test_direct_ingest_file_moves(self):
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file.csv'))

    def test_direct_ingest_multiple_file_moves(self):
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file.csv'))

        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file_2.csv'))

    def test_move_to_storage_with_conflict(self):
        dt = datetime.datetime.now()
        self.fully_process_file(
            dt,
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file.csv'))

        # Try uploading a file with a duplicate name that has already been
        # moved to storage
        self.fully_process_file(
            dt,
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file.csv'))

        # pylint: disable=protected-access
        storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '', file_type_filter=None)
        self.assertEqual(len(storage_paths), 2)

        found_first_file = False
        found_second_file = False
        for path in storage_paths:
            self.assertTrue(filename_parts_from_path(path))
            if path.abs_path().endswith('test_file.csv'):
                found_first_file = True
            if path.abs_path().endswith('test_file-(1).csv'):
                found_second_file = True

        self.assertTrue(found_first_file)
        self.assertTrue(found_second_file)

    def test_direct_ingest_file_moves_with_file_types(self):
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file.csv'),
            file_type_differentiation_on=True)

    def test_direct_ingest_multiple_file_moves_with_file_types(self):
        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file.csv'),
            file_type_differentiation_on=True)

        self.fully_process_file(
            datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file_2.csv'),
            file_type_differentiation_on=True)

    def test_move_to_storage_with_conflict_with_file_types(self):
        dt = datetime.datetime.now()
        self.fully_process_file(
            dt,
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file.csv'),
            file_type_differentiation_on=True)

        # Try uploading a file with a duplicate name that has already been
        # moved to storage
        self.fully_process_file(
            dt,
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file.csv'),
            file_type_differentiation_on=True)

        # pylint: disable=protected-access
        storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '', file_type_filter=None)
        self.assertEqual(len(storage_paths), 4)

        found_first_file = False
        found_second_file = False
        for path in storage_paths:
            if path.abs_path().endswith('test_file.csv'):
                found_first_file = True
            if path.abs_path().endswith('test_file-(1).csv'):
                found_second_file = True

        self.assertTrue(found_first_file)
        self.assertTrue(found_second_file)
class TestDirectIngestGcsFileSystem(TestCase):
    """Tests for the DirectIngestGCSFileSystem."""

    STORAGE_DIR_PATH = GcsfsDirectoryPath(bucket_name='storage_bucket',
                                          relative_path='region_subdir')
    INGEST_DIR_PATH = GcsfsDirectoryPath(bucket_name='my_bucket')

    def fully_process_file(self,
                           test_fs: FakeDirectIngestGCSFileSystem,
                           dt: datetime.datetime,
                           path: GcsfsFilePath):
        """Mimics all the file system calls for a single file in the direct
        ingest system, from getting added to the ingest bucket, turning to a
        processed file, then getting moved to storage."""
        test_fs.test_add_path(path)

        start_num_total_files = len(test_fs.all_paths)
        # pylint: disable=protected-access
        start_ingest_paths = test_fs._ls_with_file_prefix(
            self.INGEST_DIR_PATH, '')
        start_storage_paths = test_fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '')

        # File is renamed to normalized path
        test_fs.mv_path_to_normalized_path(path, dt)

        unprocessed = test_fs.get_unprocessed_file_paths(
            self.INGEST_DIR_PATH)
        self.assertEqual(len(unprocessed), 1)
        self.assertTrue(test_fs.is_seen_unprocessed_file(unprocessed[0]))

        # ... file is processed

        # File is moved to processed path
        test_fs.mv_path_to_processed_path(unprocessed[0])
        processed = test_fs.get_processed_file_paths(self.INGEST_DIR_PATH)
        self.assertEqual(len(processed), 1)
        self.assertTrue(test_fs.is_processed_file(processed[0]))

        unprocessed = test_fs.get_unprocessed_file_paths(
            self.INGEST_DIR_PATH)
        self.assertEqual(len(unprocessed), 0)

        # File is moved to storage
        test_fs.mv_processed_paths_before_date_to_storage(
            self.INGEST_DIR_PATH,
            self.STORAGE_DIR_PATH,
            dt.date().isoformat(),
            include_bound=True)

        end_ingest_paths = test_fs._ls_with_file_prefix(
            self.INGEST_DIR_PATH, '')
        end_storage_paths = test_fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '')

        self.assertEqual(len(test_fs.all_paths), start_num_total_files)
        self.assertEqual(len(end_ingest_paths),
                         len(start_ingest_paths) - 1)
        self.assertEqual(len(end_storage_paths),
                         len(start_storage_paths) + 1)

        for sp in end_storage_paths:
            if sp.abs_path() not in \
                    {p.abs_path() for p in start_storage_paths}:
                self.assertTrue(sp.abs_path().startswith(
                    self.STORAGE_DIR_PATH.abs_path()))
                _, storage_file_name = \
                    os.path.split(sp.abs_path())
                name, _ = path.file_name.split('.')
                self.assertTrue(name in storage_file_name)

    def test_direct_ingest_file_moves(self):
        test_fs = FakeDirectIngestGCSFileSystem()
        self.fully_process_file(
            test_fs, datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file.csv'))

    def test_direct_ingest_multiple_file_moves(self):
        test_fs = FakeDirectIngestGCSFileSystem()
        self.fully_process_file(
            test_fs, datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file.csv'))

        self.fully_process_file(
            test_fs, datetime.datetime.now(),
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file_2.csv'))

    def test_move_to_storage_with_conflict(self):
        test_fs = FakeDirectIngestGCSFileSystem()
        dt = datetime.datetime.now()
        self.fully_process_file(
            test_fs, dt,
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file.csv'))

        # Try uploading a file with a duplicate name that has already been
        # moved to storage
        self.fully_process_file(
            test_fs, dt,
            GcsfsFilePath(bucket_name='my_bucket',
                          blob_name='test_file.csv'))

        # pylint: disable=protected-access
        storage_paths = test_fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH, '')
        self.assertEqual(len(storage_paths), 2)

        found_first_file = False
        found_second_file = False
        for path in storage_paths:
            if path.abs_path().endswith('test_file.csv'):
                found_first_file = True
            if path.abs_path().endswith('test_file-(1).csv'):
                found_second_file = True

        self.assertTrue(found_first_file)
        self.assertTrue(found_second_file)
def _split_file_if_necessary(self, path: GcsfsFilePath):
    """Checks if the given file needs to be split according to this
    controller's |file_split_line_limit|.
    """
    parts = filename_parts_from_path(path)

    if self.region.is_raw_vs_ingest_file_name_detection_enabled() and \
            parts.file_type != GcsfsDirectIngestFileType.INGEST_VIEW:
        raise ValueError(
            f'Should not be attempting to split files other than ingest '
            f'view files, found path with file type: {parts.file_type}')

    if parts.file_tag not in self.get_file_tag_rank_list():
        logging.info(
            "File tag [%s] for path [%s] not in rank list - not splitting.",
            parts.file_tag, path.abs_path())
        return False

    if parts.is_file_split and \
            parts.file_split_size and \
            parts.file_split_size <= self.file_split_line_limit:
        logging.info("File [%s] already split with size [%s].",
                     path.abs_path(), parts.file_split_size)
        return False

    file_contents_handle = self._get_contents_handle_from_path(path)
    if not file_contents_handle:
        logging.info("File [%s] has no rows - not splitting.",
                     path.abs_path())
        return False

    if self._can_proceed_with_ingest_for_contents(file_contents_handle):
        logging.info("No need to split file path [%s].", path.abs_path())
        return False

    logging.info("Proceeding to file splitting for path [%s].",
                 path.abs_path())

    split_contents_handles = self._split_file(path, file_contents_handle)

    original_metadata = None
    if self.region.are_ingest_view_exports_enabled_in_env():
        original_metadata = self.file_metadata_manager.get_file_metadata(
            path)

    output_dir = GcsfsDirectoryPath.from_file_path(path)
    for i, split_contents_handle in enumerate(split_contents_handles):
        upload_path = self._create_split_file_path(
            path, output_dir, split_num=i)

        ingest_file_metadata = None
        if self.region.are_ingest_view_exports_enabled_in_env():
            if not isinstance(original_metadata,
                              DirectIngestIngestFileMetadata):
                raise ValueError(
                    'Attempting to split a non-ingest view type file')
            ingest_file_metadata = \
                self.file_metadata_manager.register_ingest_file_split(
                    original_metadata, upload_path)

        logging.info("Writing file split [%s] to Cloud Storage.",
                     upload_path.abs_path())
        self.fs.upload_from_contents_handle(
            upload_path, split_contents_handle, self._contents_type())

        if self.region.are_ingest_view_exports_enabled_in_env():
            if not ingest_file_metadata:
                raise ValueError(
                    f'Split file metadata for path unexpectedly none '
                    f'[{upload_path.abs_path()}]')
            self.file_metadata_manager.mark_ingest_view_exported(
                ingest_file_metadata)

    if self.region.are_ingest_view_exports_enabled_in_env():
        self.file_metadata_manager.mark_file_as_processed(path)

    logging.info(
        "Done splitting file [%s] into [%s] paths, moving it to storage.",
        path.abs_path(), len(split_contents_handles))

    self.fs.mv_path_to_storage(path, self.storage_directory_path)
    return True