def test_import_wrong_separator_cols_do_not_parse(self) -> None:
    file_config = self.import_manager.region_raw_file_config.raw_file_configs[
        "tagC"]
    updated_file_config = attr.evolve(file_config, separator="#")
    self.import_manager.region_raw_file_config.raw_file_configs[
        "tagC"] = updated_file_config

    file_path = path_for_fixture_file_in_test_gcs_directory(
        bucket_path=self.ingest_bucket_path,
        filename="tagC.csv",
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA,
    )
    fixture_util.add_direct_ingest_path(
        self.fs.gcs_file_system,
        file_path,
        region_code=self.test_region.region_code)

    with self.assertRaises(ValueError) as e:
        self.import_manager.import_raw_file_to_big_query(
            file_path, create_autospec(DirectIngestRawFileMetadata))

    self.assertTrue(
        str(e.exception).startswith(
            "Found only one column: [COL1__COL2_COL3]. "
            "Columns likely did not parse properly."))

def test_unexpected_file(self):
    # Only file is out of order
    path = self._normalized_path_for_filename(
        'tagB.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
        self._DAY_1_TIME_1)
    fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                        path,
                                        has_fixture=False)

    self.assertTrue(
        self.prioritizer.are_more_jobs_expected_for_day(
            self._DAY_1.isoformat()))

    next_job_args = self.prioritizer.get_next_job_args()
    self.assertIsNotNone(next_job_args)
    self.assertEqual(next_job_args.file_path, path)
    self.assertFalse(
        self.prioritizer.are_next_args_expected(next_job_args))

    # ... job runs eventually even though unexpected...
    self.fs.mv_path_to_processed_path(path)

    self.assertIsNone(self.prioritizer.get_next_job_args())

    # We still expect a file for tagA
    self.assertTrue(
        self.prioritizer.are_more_jobs_expected_for_day(
            self._DAY_1.isoformat()))

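# The prioritizer tests in this file build paths via a
# _normalized_path_for_filename helper that is not shown in this section.
# A minimal sketch of what it likely does, assuming the
# to_normalized_unprocessed_file_path utility and an _INGEST_BUCKET_PATH
# attribute on the test class (both names here are assumptions, not
# confirmed by this section):
def _normalized_path_for_filename(
        self, filename: str, file_type: GcsfsDirectIngestFileType,
        dt: datetime.datetime) -> GcsfsFilePath:
    """Builds the normalized unprocessed GCS path for |filename| with the
    given file type and creation datetime."""
    normalized_path_str = to_normalized_unprocessed_file_path(
        original_file_path=os.path.join(
            self._INGEST_BUCKET_PATH.abs_path(), filename),
        file_type=file_type,
        dt=dt)
    return GcsfsFilePath.from_absolute_path(normalized_path_str)
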
def test_import_bq_file_with_raw_file_invalid_column_chars(self):
    file_path = path_for_fixture_file_in_test_gcs_directory(
        directory=self.ingest_directory_path,
        filename='tagInvalidCharacters.csv',
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA)
    fixture_util.add_direct_ingest_path(self.fs.gcs_file_system, file_path)

    self.import_manager.import_raw_file_to_big_query(
        file_path, self._metadata_for_unprocessed_file_path(file_path))

    self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
    path = one(self.fs.gcs_file_system.uploaded_paths)

    self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
        source_uri=path.uri(),
        destination_dataset_ref=bigquery.DatasetReference(
            self.project_id, 'us_xx_raw_data'),
        destination_table_id='tagInvalidCharacters',
        destination_table_schema=[
            bigquery.SchemaField('COL_1', 'STRING', 'NULLABLE'),
            bigquery.SchemaField('_COL2', 'STRING', 'NULLABLE'),
            bigquery.SchemaField('_3COL', 'STRING', 'NULLABLE'),
            bigquery.SchemaField('_4_COL', 'STRING', 'NULLABLE'),
            bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'),
            bigquery.SchemaField('update_datetime', 'DATETIME', 'REQUIRED')
        ])

    self.assertEqual(1, self.num_lines_uploaded)
    self._check_no_temp_files_remain()

def test_files_on_multiple_days(self):
    paths = [
        self._normalized_path_for_filename(
            'tagA.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
            self._DAY_1_TIME_1),
        self._normalized_path_for_filename(
            'tagB.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
            self._DAY_1_TIME_2),
        self._normalized_path_for_filename(
            'tagA.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
            self._DAY_2_TIME_1),
        # This file shouldn't get picked up
        self._normalized_path_for_filename(
            'tagC.csv', GcsfsDirectIngestFileType.RAW_DATA,
            self._DAY_1_TIME_3)
    ]
    for path in paths:
        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                            path,
                                            has_fixture=False)

    # Exclude the last path, which is a raw file
    expected_processed_paths = paths[0:-1]

    self._process_jobs_for_paths_with_no_gaps_in_expected_order(
        expected_processed_paths)

    self.assertIsNone(self.prioritizer.get_next_job_args())
    self.assertFalse(
        self.prioritizer.are_more_jobs_expected_for_day(
            self._DAY_1.isoformat()))
    self.assertTrue(
        self.prioritizer.are_more_jobs_expected_for_day(
            self._DAY_2.isoformat()))

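# Several tests here call
# _process_jobs_for_paths_with_no_gaps_in_expected_order, whose body is not
# shown in this section. A minimal sketch, inferred from the explicit loops
# in the gap/out-of-order tests below (the exact assertions in the real
# helper are an assumption):
def _process_jobs_for_paths_with_no_gaps_in_expected_order(
        self, paths: List[GcsfsFilePath]) -> None:
    """Pops each path off the prioritizer in order, asserting that it is the
    expected next job, then marks it processed."""
    for path in paths:
        next_job_args = self.prioritizer.get_next_job_args()
        if next_job_args is None:
            self.fail("Next job args unexpectedly None")
        self.assertEqual(next_job_args.file_path, path)
        self.assertTrue(
            self.prioritizer.are_next_args_expected(next_job_args))
        # ... job runs ...
        self.fs.mv_path_to_processed_path(path)
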
def test_files_on_multiple_days(self) -> None:
    paths = [
        self._normalized_path_for_filename(
            "tagA.csv", GcsfsDirectIngestFileType.INGEST_VIEW,
            self._DAY_1_TIME_1),
        self._normalized_path_for_filename(
            "tagB.csv", GcsfsDirectIngestFileType.RAW_DATA,
            self._DAY_1_TIME_2),
        self._normalized_path_for_filename(
            "tagA.csv", GcsfsDirectIngestFileType.UNSPECIFIED,
            self._DAY_2_TIME_1),
    ]
    for path in paths:
        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                            path,
                                            has_fixture=False)

    self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

    self.assertIsNone(self.prioritizer.get_next_job_args())
    self.assertFalse(
        self.prioritizer.are_more_jobs_expected_for_day(
            self._DAY_1.isoformat()))
    self.assertTrue(
        self.prioritizer.are_more_jobs_expected_for_day(
            self._DAY_2.isoformat()))

def test_run_full_ingest_all_files(self):
    file_tags = sorted(self.controller.get_file_tag_rank_list())
    file_path = path_for_fixture_file(
        self.controller, 'VERABrazosJailData_01012019_115703.csv', False)
    fixture_util.add_direct_ingest_path(self.controller.fs.gcs_file_system,
                                        file_path)
    process_task_queues(self, self.controller, file_tags)

def test_get_unprocessed_raw_files_to_import(self) -> None:
    self.assertEqual(
        [], self.import_manager.get_unprocessed_raw_files_to_import())

    raw_unprocessed = path_for_fixture_file_in_test_gcs_directory(
        directory=self.ingest_directory_path,
        filename="file_tag_first.csv",
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA,
    )
    ingest_view_unprocessed = path_for_fixture_file_in_test_gcs_directory(
        directory=self.ingest_directory_path,
        filename="file_tag_second.csv",
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.INGEST_VIEW,
    )
    fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                        raw_unprocessed,
                                        has_fixture=False)
    fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                        ingest_view_unprocessed,
                                        has_fixture=False)

    self.assertEqual(
        [raw_unprocessed],
        self.import_manager.get_unprocessed_raw_files_to_import())

def test_import_bq_file_with_raw_file_invalid_column_chars(self) -> None:
    file_path = path_for_fixture_file_in_test_gcs_directory(
        directory=self.ingest_directory_path,
        filename="tagInvalidCharacters.csv",
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA,
    )
    fixture_util.add_direct_ingest_path(self.fs.gcs_file_system, file_path)

    self.import_manager.import_raw_file_to_big_query(
        file_path, self._metadata_for_unprocessed_file_path(file_path))

    self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
    path = one(self.fs.gcs_file_system.uploaded_paths)

    self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
        source_uri=path.uri(),
        destination_dataset_ref=bigquery.DatasetReference(
            self.project_id, "us_xx_raw_data"),
        destination_table_id="tagInvalidCharacters",
        destination_table_schema=[
            bigquery.SchemaField("COL_1", "STRING", "NULLABLE"),
            bigquery.SchemaField("_COL2", "STRING", "NULLABLE"),
            bigquery.SchemaField("_3COL", "STRING", "NULLABLE"),
            bigquery.SchemaField("_4_COL", "STRING", "NULLABLE"),
            bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
            bigquery.SchemaField("update_datetime", "DATETIME", "REQUIRED"),
        ],
    )

    self.assertEqual(1, self.num_lines_uploaded)
    self._check_no_temp_files_remain()

def export_query_results_to_cloud_storage(
        self, export_configs: List[ExportQueryConfig]) -> None:
    for export_config in export_configs:
        export_path = GcsfsFilePath.from_absolute_path(
            export_config.output_uri)
        fixture_util.add_direct_ingest_path(self.fs, export_path)
        self.exported_file_tags.append(
            filename_parts_from_path(export_path).file_tag)

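# Hypothetical usage of the fake exporter above (the config and tag values
# are assumed): a test can trigger an export, then assert on the tags that
# were recorded:
#
#     self.export_query_results_to_cloud_storage([export_config])
#     self.assertEqual(["file_tag_first"], self.exported_file_tags)
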
def run_parse_file_test(self, expected: IngestInfo,
                        fixture_file_name: str) -> IngestInfo:
    """Runs a test that reads and parses a given fixture file. Returns the
    parsed IngestInfo object for tests to run further validations."""
    args = ingest_args_for_fixture_file(self.controller,
                                        f"{fixture_file_name}.csv")

    if not isinstance(self.controller.fs.gcs_file_system, FakeGCSFileSystem):
        raise ValueError(
            f"Controller fs must have type "
            f"FakeGCSFileSystem. Found instead "
            f"type [{type(self.controller.fs.gcs_file_system)}]")

    if self.controller.region.is_ingest_launched_in_env():
        now = datetime.datetime.now()
        yesterday = now - datetime.timedelta(days=1)
        ingest_file_export_job_args = GcsfsIngestViewExportArgs(
            ingest_view_name=fixture_file_name,
            upper_bound_datetime_to_export=now,
            upper_bound_datetime_prev=yesterday,
            output_bucket_name=self.controller.ingest_bucket_path.bucket_name,
        )
        self.controller.file_metadata_manager.register_ingest_file_export_job(
            ingest_file_export_job_args)
        self.controller.ingest_view_export_manager.export_view_for_args(
            ingest_file_export_job_args)
    else:
        fixture_util.add_direct_ingest_path(
            self.controller.fs.gcs_file_system,
            args.file_path,
            region_code=self.controller.region_code(),
        )

    # pylint:disable=protected-access
    fixture_contents_handle = self.controller._get_contents_handle(args)

    if fixture_contents_handle is None:
        self.fail("fixture_contents_handle should not be None")
    final_info = self.controller._parse(args, fixture_contents_handle)

    print_visible_header_label("FINAL")
    print(final_info)

    print_visible_header_label("EXPECTED")
    print(expected)

    self.assertEqual(expected, final_info)

    return final_info

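# Hypothetical usage from a region-specific parser test (the expected
# IngestInfo contents and the fixture name are assumed):
#
#     expected = IngestInfo(...)
#     self.run_parse_file_test(expected, "tagBasicData")
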
def test_single_expected_file(self):
    path = self._normalized_path_for_filename(
        'tagA.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
        self._DAY_1_TIME_1)
    fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                        path,
                                        has_fixture=False)

    self._process_jobs_for_paths_with_no_gaps_in_expected_order([path])

    self.assertIsNone(self.prioritizer.get_next_job_args())

    # We still expect a file for tagB
    self.assertTrue(
        self.prioritizer.are_more_jobs_expected_for_day(
            self._DAY_1.isoformat()))

def test_import_bq_file_with_raw_file_normalization_conflict(self):
    with self.assertRaises(ValueError) as e:
        file_path = path_for_fixture_file_in_test_gcs_directory(
            directory=self.ingest_directory_path,
            filename='tagNormalizationConflict.csv',
            should_normalize=True,
            file_type=GcsfsDirectIngestFileType.RAW_DATA)
        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                            file_path)

        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

    self.assertEqual(
        str(e.exception),
        "Multiple columns with name [_4COL] after normalization.")

def test_files_on_multiple_days_with_gap(self) -> None:
    """Runs a test where there are files on multiple days and there is a gap
    in the expected files for the first day.
    """
    paths = [
        self._normalized_path_for_filename(
            "tagB.csv", GcsfsDirectIngestFileType.INGEST_VIEW,
            self._DAY_1_TIME_2),
        self._normalized_path_for_filename(
            "tagA.csv", GcsfsDirectIngestFileType.INGEST_VIEW,
            self._DAY_2_TIME_1),
    ]
    for path in paths:
        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                            path,
                                            has_fixture=False)

    for i, path in enumerate(paths):
        date_str = filename_parts_from_path(path).date_str
        next_job_args = self.prioritizer.get_next_job_args()
        if next_job_args is None:
            self.fail("Next job args unexpectedly None")
        self.assertEqual(next_job_args.file_path, path)

        are_args_expected = self.prioritizer.are_next_args_expected(
            next_job_args)
        if i == 0:
            self.assertFalse(are_args_expected)
        else:
            self.assertTrue(are_args_expected)

        self.assertTrue(
            self.prioritizer.are_more_jobs_expected_for_day(date_str))

        # ... job runs ...
        self.fs.mv_path_to_processed_path(path)

    self.assertIsNone(self.prioritizer.get_next_job_args())
    self.assertTrue(
        self.prioritizer.are_more_jobs_expected_for_day(
            self._DAY_1.isoformat()))
    self.assertTrue(
        self.prioritizer.are_more_jobs_expected_for_day(
            self._DAY_2.isoformat()))

def test_import_bq_file_multiple_chunks_uneven_division(self) -> None:
    self.import_manager.upload_chunk_size = 2

    file_path = path_for_fixture_file_in_test_gcs_directory(
        bucket_path=self.ingest_bucket_path,
        filename="tagPipeSeparatedNonUTF8.txt",
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA,
    )
    fixture_util.add_direct_ingest_path(
        self.fs.gcs_file_system,
        file_path,
        region_code=self.test_region.region_code)

    self.import_manager.import_raw_file_to_big_query(
        file_path, self._metadata_for_unprocessed_file_path(file_path))

    self.assertEqual(3, len(self.fs.gcs_file_system.uploaded_paths))

    expected_insert_calls = [
        call(
            source_uri=uploaded_path.uri(),
            destination_dataset_ref=bigquery.DatasetReference(
                self.project_id, "us_xx_raw_data"),
            destination_table_id="tagPipeSeparatedNonUTF8",
            destination_table_schema=[
                bigquery.SchemaField("PRIMARY_COL1", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL2", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL3", "STRING", "NULLABLE"),
                bigquery.SchemaField("COL4", "STRING", "NULLABLE"),
                bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
                bigquery.SchemaField("update_datetime", "DATETIME",
                                     "REQUIRED"),
            ],
        ) for uploaded_path in self.fs.gcs_file_system.uploaded_paths
    ]

    self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_has_calls(
        expected_insert_calls, any_order=True)
    self.assertEqual(
        len(expected_insert_calls) - 1, self.mock_time.sleep.call_count)
    self.assertEqual(5, self.num_lines_uploaded)
    self._check_no_temp_files_remain()

def test_multiple_files_times_out_of_order(self) -> None:
    """Runs a test where there are no gaps but the files have been added
    (i.e. have creation times) out of order.
    """
    paths = [
        self._normalized_path_for_filename(
            "tagA.csv", GcsfsDirectIngestFileType.INGEST_VIEW,
            self._DAY_1_TIME_2),
        self._normalized_path_for_filename(
            "tagB.csv", GcsfsDirectIngestFileType.INGEST_VIEW,
            self._DAY_1_TIME_1),
        self._normalized_path_for_filename(
            "tagB.csv", GcsfsDirectIngestFileType.INGEST_VIEW,
            self._DAY_1_TIME_3),
    ]
    for path in paths:
        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                            path,
                                            has_fixture=False)

    for i, path in enumerate(paths):
        date_str = filename_parts_from_path(path).date_str
        next_job_args = self.prioritizer.get_next_job_args()
        if next_job_args is None:
            self.fail("Next job args unexpectedly None")
        self.assertEqual(next_job_args.file_path, path)
        self.assertTrue(
            self.prioritizer.are_next_args_expected(next_job_args))

        are_more_jobs_expected = self.prioritizer.are_more_jobs_expected_for_day(
            date_str)
        if i == 2:
            self.assertFalse(are_more_jobs_expected)
        else:
            self.assertTrue(are_more_jobs_expected)

        # ... job runs ...
        self.fs.mv_path_to_processed_path(path)

    self.assertIsNone(self.prioritizer.get_next_job_args())
    self.assertFalse(
        self.prioritizer.are_more_jobs_expected_for_day(
            self._DAY_1.isoformat()))

def test_import_bq_file_with_migrations(self) -> None:
    file_datetime = migrations_tagC.DATE_1
    file_path = path_for_fixture_file_in_test_gcs_directory(
        bucket_path=self.ingest_bucket_path,
        filename="tagC.csv",
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA,
        dt=file_datetime,
    )
    fixture_util.add_direct_ingest_path(
        self.fs.gcs_file_system,
        file_path,
        region_code=self.test_region.region_code)

    mock_query_jobs = [
        mock.MagicMock(),
        mock.MagicMock(),
    ]
    self.mock_big_query_client.run_query_async.side_effect = mock_query_jobs

    self.import_manager.import_raw_file_to_big_query(
        file_path, self._metadata_for_unprocessed_file_path(file_path))

    self.mock_big_query_client.run_query_async.assert_has_calls([
        mock.call(
            query_str="UPDATE `recidiviz-456.us_xx_raw_data.tagC` original\n"
            "SET COL1 = updates.new__COL1\n"
            "FROM (SELECT * FROM UNNEST([\n"
            " STRUCT('123' AS COL1, CAST('2020-06-10T00:00:00' AS DATETIME) AS update_datetime, '456' AS new__COL1),\n"
            " STRUCT('123' AS COL1, CAST('2020-09-21T00:00:00' AS DATETIME) AS update_datetime, '456' AS new__COL1)\n"
            "])) updates\n"
            "WHERE original.COL1 = updates.COL1 AND original.update_datetime = updates.update_datetime;"
        ),
        mock.call(
            query_str="DELETE FROM `recidiviz-456.us_xx_raw_data.tagC`\n"
            "WHERE STRUCT(COL1) IN (\n"
            " STRUCT('789')\n"
            ");"),
    ])

    for mock_query_job in mock_query_jobs:
        mock_query_job.result.assert_called_once()

def add_paths_with_tags(controller: GcsfsDirectIngestController,
                        file_tags: List[str],
                        pre_normalize_filename: bool = False,
                        file_type=GcsfsDirectIngestFileType.UNSPECIFIED):
    if not isinstance(controller.fs.gcs_file_system, FakeGCSFileSystem):
        raise ValueError(f"Controller fs must have type "
                         f"FakeGCSFileSystem. Found instead "
                         f"type [{type(controller.fs.gcs_file_system)}]")

    for file_tag in file_tags:
        file_path = path_for_fixture_file(
            controller,
            f'{file_tag}.csv',
            should_normalize=pre_normalize_filename,
            file_type=file_type)
        # Only add a fixture if the path is a file; if it is a directory,
        # the fixture is left as None
        fixture_util.add_direct_ingest_path(controller.fs.gcs_file_system,
                                            file_path)
        time.sleep(0.05)

def test_import_bq_file_with_row_extra_columns(self) -> None:
    file_path = path_for_fixture_file_in_test_gcs_directory(
        bucket_path=self.ingest_bucket_path,
        filename="tagRowExtraColumns.csv",
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA,
    )
    fixture_util.add_direct_ingest_path(
        self.fs.gcs_file_system,
        file_path,
        region_code=self.test_region.region_code)

    with self.assertRaisesRegex(ParserError,
                                "Expected 4 fields in line 3, saw 5"):
        self.import_manager.import_raw_file_to_big_query(
            file_path, self._metadata_for_unprocessed_file_path(file_path))

    self.assertEqual(0, len(self.fs.gcs_file_system.uploaded_paths))
    self._check_no_temp_files_remain()

def test_import_bq_file_multiple_chunks_uneven_division(self):
    self.import_manager.upload_chunk_size = 2

    file_path = path_for_fixture_file_in_test_gcs_directory(
        directory=self.ingest_directory_path,
        filename='tagPipeSeparatedNonUTF8.txt',
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA)
    fixture_util.add_direct_ingest_path(self.fs.gcs_file_system, file_path)

    self.import_manager.import_raw_file_to_big_query(
        file_path, self._metadata_for_unprocessed_file_path(file_path))

    self.assertEqual(3, len(self.fs.gcs_file_system.uploaded_paths))

    expected_insert_calls = [
        call(source_uri=uploaded_path.uri(),
             destination_dataset_ref=bigquery.DatasetReference(
                 self.project_id, 'us_xx_raw_data'),
             destination_table_id='tagPipeSeparatedNonUTF8',
             destination_table_schema=[
                 bigquery.SchemaField('PRIMARY_COL1', 'STRING', 'NULLABLE'),
                 bigquery.SchemaField('COL2', 'STRING', 'NULLABLE'),
                 bigquery.SchemaField('COL3', 'STRING', 'NULLABLE'),
                 bigquery.SchemaField('COL4', 'STRING', 'NULLABLE'),
                 bigquery.SchemaField('file_id', 'INTEGER', 'REQUIRED'),
                 bigquery.SchemaField('update_datetime', 'DATETIME',
                                      'REQUIRED')
             ]) for uploaded_path in self.fs.gcs_file_system.uploaded_paths
    ]

    self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_has_calls(
        expected_insert_calls, any_order=True)
    self.assertEqual(
        len(expected_insert_calls) - 1, self.mock_time.sleep.call_count)
    self.assertEqual(5, self.num_lines_uploaded)
    self._check_no_temp_files_remain()

def test_run_multiple_copies_of_same_tag(self):
    paths = [
        self._normalized_path_for_filename(
            'tagA.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
            self._DAY_1_TIME_2),
        self._normalized_path_for_filename(
            'tagA_2.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
            self._DAY_1_TIME_1),
        self._normalized_path_for_filename(
            'tagB.csv', GcsfsDirectIngestFileType.INGEST_VIEW,
            self._DAY_1_TIME_3),
    ]
    for path in paths:
        fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                            path,
                                            has_fixture=False)

    self._process_jobs_for_paths_with_no_gaps_in_expected_order(paths)

    self.assertIsNone(self.prioritizer.get_next_job_args())
    self.assertFalse(
        self.prioritizer.are_more_jobs_expected_for_day(
            self._DAY_1.isoformat()))

def test_import_bq_file_with_multibyte_raw_file_alternate_separator_and_encoding(
    self,
) -> None:
    file_path = path_for_fixture_file_in_test_gcs_directory(
        bucket_path=self.ingest_bucket_path,
        filename="tagDoubleDaggerWINDOWS1252.csv",
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA,
    )
    fixture_util.add_direct_ingest_path(
        self.fs.gcs_file_system,
        file_path,
        region_code=self.test_region.region_code)

    self.import_manager.import_raw_file_to_big_query(
        file_path, self._metadata_for_unprocessed_file_path(file_path))

    self.assertEqual(1, len(self.fs.gcs_file_system.uploaded_paths))
    path = one(self.fs.gcs_file_system.uploaded_paths)

    self.mock_big_query_client.insert_into_table_from_cloud_storage_async.assert_called_with(
        source_uri=path.uri(),
        destination_dataset_ref=bigquery.DatasetReference(
            self.project_id, "us_xx_raw_data"),
        destination_table_id="tagDoubleDaggerWINDOWS1252",
        destination_table_schema=[
            bigquery.SchemaField("PRIMARY_COL1", "STRING", "NULLABLE"),
            bigquery.SchemaField("COL2", "STRING", "NULLABLE"),
            bigquery.SchemaField("COL3", "STRING", "NULLABLE"),
            bigquery.SchemaField("COL4", "STRING", "NULLABLE"),
            bigquery.SchemaField("file_id", "INTEGER", "REQUIRED"),
            bigquery.SchemaField("update_datetime", "DATETIME", "REQUIRED"),
        ],
    )

    self.assertEqual(5, self.num_lines_uploaded)
    self._check_no_temp_files_remain()

def add_paths_with_tags(
    controller: BaseDirectIngestController,
    file_tags: List[str],
    pre_normalized_file_type: Optional[GcsfsDirectIngestFileType] = None,
) -> None:
    if not isinstance(controller.fs.gcs_file_system, FakeGCSFileSystem):
        raise ValueError(f"Controller fs must have type "
                         f"FakeGCSFileSystem. Found instead "
                         f"type [{type(controller.fs.gcs_file_system)}]")

    for file_tag in file_tags:
        file_path = path_for_fixture_file(
            controller,
            f"{file_tag}.csv",
            should_normalize=bool(pre_normalized_file_type),
            file_type=pre_normalized_file_type,
        )
        # Only add a fixture if the path is a file; if it is a directory,
        # the fixture is left as None
        fixture_util.add_direct_ingest_path(
            controller.fs.gcs_file_system,
            file_path,
            region_code=controller.region_code(),
        )
        time.sleep(0.05)

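# Hypothetical usage (the controller and tag values are assumed):
# pre-populate the fake ingest bucket with already-normalized raw files
# before running the controller:
#
#     add_paths_with_tags(
#         controller,
#         file_tags=["tagA", "tagB"],
#         pre_normalized_file_type=GcsfsDirectIngestFileType.RAW_DATA,
#     )
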
def test_import_bq_file_with_migrations(self) -> None:
    file_datetime = migrations_tagC.DATE_1
    file_path = path_for_fixture_file_in_test_gcs_directory(
        directory=self.ingest_directory_path,
        filename="tagC.csv",
        should_normalize=True,
        file_type=GcsfsDirectIngestFileType.RAW_DATA,
        dt=file_datetime,
    )
    fixture_util.add_direct_ingest_path(self.fs.gcs_file_system, file_path)

    self.import_manager.import_raw_file_to_big_query(
        file_path, self._metadata_for_unprocessed_file_path(file_path))

    self.mock_big_query_client.run_query_async.assert_has_calls([
        mock.call(
            query_str=
            f"UPDATE `recidiviz-456.us_xx_raw_data.tagC` SET COL1 = '456' WHERE COL1 = '123' AND update_datetime = '{file_datetime.isoformat()}';"
        ),
        mock.call(
            query_str=
            "DELETE FROM `recidiviz-456.us_xx_raw_data.tagC` WHERE COL1 = '789';"
        ),
    ])

def test_run_full_ingest_all_files(self):
    file_tags = sorted(self.controller.get_file_tag_rank_list())
    file_path = path_for_fixture_file(
        self.controller, 'MDC_VERA_20200303_01.csv', False)
    fixture_util.add_direct_ingest_path(self.controller.fs.gcs_file_system,
                                        file_path)
    process_task_queues(self, self.controller, file_tags)

def fully_process_file(self,
                       dt: datetime.datetime,
                       path: GcsfsFilePath,
                       file_type_differentiation_on: bool = False) -> None:
    """Mimics all the file system calls for a single file in the direct
    ingest system, from getting added to the ingest bucket, turning to a
    processed file, then getting moved to storage."""
    fixture_util.add_direct_ingest_path(self.fs.gcs_file_system,
                                        path,
                                        has_fixture=False)

    start_num_total_files = len(self.fs.gcs_file_system.all_paths)
    # pylint: disable=protected-access
    start_ingest_paths = self.fs._ls_with_file_prefix(
        self.INGEST_DIR_PATH, '', None)
    start_storage_paths = self.fs._ls_with_file_prefix(
        self.STORAGE_DIR_PATH, '', None)
    if file_type_differentiation_on:
        start_raw_storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH,
            '',
            file_type_filter=GcsfsDirectIngestFileType.RAW_DATA)
        start_ingest_view_storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH,
            '',
            file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW)
    else:
        start_raw_storage_paths = []
        start_ingest_view_storage_paths = []

    # File is renamed to normalized path
    file_type = GcsfsDirectIngestFileType.RAW_DATA \
        if file_type_differentiation_on else GcsfsDirectIngestFileType.UNSPECIFIED
    self.fs.mv_path_to_normalized_path(path, file_type, dt)

    if file_type_differentiation_on:
        raw_unprocessed = self.fs.get_unprocessed_file_paths(
            self.INGEST_DIR_PATH,
            file_type_filter=GcsfsDirectIngestFileType.RAW_DATA)
        self.assertEqual(len(raw_unprocessed), 1)
        self.assertTrue(
            self.fs.is_seen_unprocessed_file(raw_unprocessed[0]))

        # ... raw file imported to BQ
        processed_path = self.fs.mv_path_to_processed_path(
            raw_unprocessed[0])
        processed = self.fs.get_processed_file_paths(
            self.INGEST_DIR_PATH, None)
        self.assertEqual(len(processed), 1)

        self.fs.copy(
            processed_path,
            GcsfsFilePath.from_absolute_path(
                to_normalized_unprocessed_file_path_from_normalized_path(
                    processed_path.abs_path(),
                    file_type_override=GcsfsDirectIngestFileType.
                    INGEST_VIEW)))
        self.fs.mv_path_to_storage(processed_path, self.STORAGE_DIR_PATH)

    ingest_unprocessed_filter = GcsfsDirectIngestFileType.INGEST_VIEW \
        if file_type_differentiation_on else None
    ingest_unprocessed = self.fs.get_unprocessed_file_paths(
        self.INGEST_DIR_PATH, file_type_filter=ingest_unprocessed_filter)
    self.assertEqual(len(ingest_unprocessed), 1)
    self.assertTrue(self.fs.is_seen_unprocessed_file(
        ingest_unprocessed[0]))

    # ... file is ingested

    # File is moved to processed path
    self.fs.mv_path_to_processed_path(ingest_unprocessed[0])
    processed = self.fs.get_processed_file_paths(self.INGEST_DIR_PATH,
                                                 None)
    self.assertEqual(len(processed), 1)
    self.assertTrue(self.fs.is_processed_file(processed[0]))

    unprocessed = self.fs.get_unprocessed_file_paths(
        self.INGEST_DIR_PATH, None)
    self.assertEqual(len(unprocessed), 0)

    # File is moved to storage
    ingest_move_type_filter = GcsfsDirectIngestFileType.INGEST_VIEW \
        if file_type_differentiation_on else None
    self.fs.mv_processed_paths_before_date_to_storage(
        self.INGEST_DIR_PATH,
        self.STORAGE_DIR_PATH,
        date_str_bound=dt.date().isoformat(),
        include_bound=True,
        file_type_filter=ingest_move_type_filter)

    end_ingest_paths = self.fs._ls_with_file_prefix(self.INGEST_DIR_PATH,
                                                    '',
                                                    file_type_filter=None)
    end_storage_paths = self.fs._ls_with_file_prefix(self.STORAGE_DIR_PATH,
                                                     '',
                                                     file_type_filter=None)
    if file_type_differentiation_on:
        end_raw_storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH,
            '',
            file_type_filter=GcsfsDirectIngestFileType.RAW_DATA)
        end_ingest_view_storage_paths = self.fs._ls_with_file_prefix(
            self.STORAGE_DIR_PATH,
            '',
            file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW)
    else:
        end_raw_storage_paths = []
        end_ingest_view_storage_paths = []

    # Each file gets re-exported as ingest view
    splitting_factor = 2 if file_type_differentiation_on else 1

    expected_final_total_files = start_num_total_files + splitting_factor - 1
    self.assertEqual(len(self.fs.gcs_file_system.all_paths),
                     expected_final_total_files)
    self.assertEqual(len(end_ingest_paths), len(start_ingest_paths) - 1)
    self.assertEqual(len(end_storage_paths),
                     len(start_storage_paths) + 1 * splitting_factor)
    if file_type_differentiation_on:
        self.assertEqual(
            len(end_raw_storage_paths) +
            len(end_ingest_view_storage_paths), len(end_storage_paths))
        self.assertEqual(len(end_raw_storage_paths),
                         len(start_raw_storage_paths) + 1)
        self.assertEqual(len(end_ingest_view_storage_paths),
                         len(start_ingest_view_storage_paths) + 1)

    for sp in end_storage_paths:
        parts = filename_parts_from_path(sp)
        if sp.abs_path() not in {
                p.abs_path() for p in start_storage_paths
        }:
            self.assertTrue(sp.abs_path().startswith(
                self.STORAGE_DIR_PATH.abs_path()))
            dir_path, storage_file_name = os.path.split(sp.abs_path())
            if parts.file_type != GcsfsDirectIngestFileType.UNSPECIFIED:
                self.assertTrue(parts.file_type.value in dir_path)
            name, _ = path.file_name.split('.')
            self.assertTrue(name in storage_file_name)

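# Hypothetical usage (the datetime and bucket path values are assumed):
# drive a single file through the full rename/process/storage lifecycle with
# file type differentiation enabled:
#
#     self.fully_process_file(
#         datetime.datetime(2020, 1, 1),
#         GcsfsFilePath.from_absolute_path("gs://my-ingest-bucket/tagA.csv"),
#         file_type_differentiation_on=True,
#     )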