def test_read_completely_empty_file(self) -> None:
    empty_file_path = fixtures.as_filepath("tagA.csv")

    delegate = _TestGcsfsCsvReaderDelegate()
    self.reader.streaming_read(
        GcsfsFilePath.from_absolute_path(empty_file_path),
        delegate=delegate,
        chunk_size=1,
    )
    self.assertEqual(1, len(delegate.encodings_attempted))
    self.assertEqual(delegate.encodings_attempted[0], delegate.successful_encoding)
    self.assertEqual(0, len(delegate.dataframes))
    self.assertEqual(0, delegate.decode_errors)
    self.assertEqual(0, delegate.exceptions)

    delegate = _TestGcsfsCsvReaderDelegate()
    self.reader.streaming_read(
        GcsfsFilePath.from_absolute_path(empty_file_path),
        delegate=delegate,
        chunk_size=10,
    )
    self.assertEqual(1, len(delegate.encodings_attempted))
    self.assertEqual(delegate.encodings_attempted[0], delegate.successful_encoding)
    self.assertEqual(0, len(delegate.dataframes))
    self.assertEqual(0, delegate.decode_errors)
    self.assertEqual(0, delegate.exceptions)
def test_load_files_from_storage(self, mock_gcs_factory: MagicMock) -> None:
    """Test that load_files_from_storage returns files for the current batch and bucket name"""
    bucket_name = "bucket-name"
    self.mock_utils.get_email_content_bucket_name.return_value = bucket_name
    email_path = GcsfsFilePath.from_absolute_path(
        f"gs://{bucket_name}/{self.state_code}/{self.batch_id}/{self.to_address}.html"
    )
    other_path = GcsfsFilePath.from_absolute_path(
        f"gs://{bucket_name}/excluded/exclude.json"
    )
    fake_gcs_file_system = FakeGCSFileSystem()
    fake_gcs_file_system.upload_from_string(
        path=email_path, contents="<html>", content_type="text/html"
    )
    fake_gcs_file_system.upload_from_string(
        path=other_path, contents="{}", content_type="text/json"
    )
    mock_gcs_factory.return_value = fake_gcs_file_system

    files = email_delivery.load_files_from_storage(
        bucket_name, f"{self.state_code}/{self.batch_id}"
    )

    self.assertEqual(files, {f"{self.to_address}": "<html>"})
def update_data_freshness_results(self) -> None:
    """Refreshes information in the metadata store about freshness of ingested data
    for all states."""
    bq_export_config = CloudSqlToBQConfig.for_schema_type(
        SchemaType.STATE,
        yaml_path=GcsfsFilePath.from_absolute_path(
            f"gs://{self.project_id}-configs/cloud_sql_to_bq_config.yaml"
        ),
    )
    if bq_export_config is None:
        raise ValueError("STATE CloudSqlToBQConfig unexpectedly None.")
    regions_paused = bq_export_config.region_codes_to_exclude

    latest_upper_bounds_path = GcsfsFilePath.from_absolute_path(
        f"gs://{self.project_id}-ingest-metadata/ingest_metadata_latest_ingested_upper_bounds.json"
    )
    latest_upper_bounds_json = self.gcs_fs.download_as_string(latest_upper_bounds_path)
    latest_upper_bounds = []
    for line in latest_upper_bounds_json.splitlines():
        line = line.strip()
        if not line:
            continue
        struct = json.loads(line)
        latest_upper_bounds.append(
            {
                "state": struct["state_code"],
                "date": struct.get("processed_date"),
                "ingestPaused": struct["state_code"] in regions_paused,
            }
        )
    self.data_freshness_results = latest_upper_bounds
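# A minimal, self-contained sketch (not part of the store) of the row shape that
# update_data_freshness_results above produces from one newline-delimited JSON line.
# The sample line mirrors the ingest_metadata_latest_ingested_upper_bounds.json fixture
# used in the setUp test below; the paused-regions list is a hypothetical stand-in for
# bq_export_config.region_codes_to_exclude.
import json

_row = json.loads('{"state_code":"US_ND","processed_date":"2020-12-16"}')
_regions_paused = ["US_ND"]  # hypothetical stand-in
assert {
    "state": _row["state_code"],
    "date": _row.get("processed_date"),
    "ingestPaused": _row["state_code"] in _regions_paused,
} == {"state": "US_ND", "date": "2020-12-16", "ingestPaused": True}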
def _move_files(self, from_uri: str):
    curr_gcsfs_file_path = GcsfsFilePath.from_absolute_path(from_uri)
    previous_date_format = filename_parts_from_path(curr_gcsfs_file_path).date_str
    new_date_format = date.fromisoformat(previous_date_format).strftime("%Y/%m/%d/")

    path_with_new_file_name = GcsfsFilePath.from_absolute_path(
        to_normalized_unprocessed_file_path_from_normalized_path(
            from_uri, GcsfsDirectIngestFileType.RAW_DATA))
    if DirectIngestGCSFileSystem.is_processed_file(curr_gcsfs_file_path):
        path_with_new_file_name = GcsfsFilePath.from_absolute_path(
            to_normalized_processed_file_path_from_normalized_path(
                from_uri, GcsfsDirectIngestFileType.RAW_DATA))

    raw_dir_with_date = GcsfsDirectoryPath.from_dir_and_subdir(
        self.region_storage_raw_dir_path, new_date_format)

    to_uri = GcsfsFilePath.from_directory_and_file_name(
        raw_dir_with_date, path_with_new_file_name.file_name).uri()

    if not self.dry_run:
        gsutil_mv(from_path=from_uri, to_path=to_uri)
    with self.mutex:
        self.move_list.append((from_uri, to_uri))
        if self.move_progress:
            self.move_progress.next()
def test_is_task_queued_has_tasks(self):
    # Arrange
    file_path = to_normalized_unprocessed_file_path(
        'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
    gcsfs_args = \
        GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=GcsfsFilePath.from_absolute_path(file_path))

    full_task_name = \
        _build_task_id(_REGION.region_code, gcsfs_args.task_id_tag())
    info = ProcessIngestJobCloudTaskQueueInfo(
        queue_name='queue_name',
        task_names=[
            'projects/path/to/random_task',
            f'projects/path/to/{full_task_name}'
        ])
    file_path = to_normalized_unprocessed_file_path(
        'bucket/file_path.csv', GcsfsDirectIngestFileType.INGEST_VIEW)
    gcsfs_args = \
        GcsfsIngestArgs(
            ingest_time=datetime.datetime.now(),
            file_path=GcsfsFilePath.from_absolute_path(file_path))

    # Act
    gcsfs_args_queued = info.is_task_queued(_REGION, gcsfs_args)

    # Assert
    self.assertTrue(gcsfs_args_queued)
def test_read_file_with_columns_no_contents(self):
    empty_file_path = fixtures.as_filepath('tagB.csv')

    delegate = TestGcsfsCsvReaderDelegate()
    self.reader.streaming_read(
        GcsfsFilePath.from_absolute_path(empty_file_path),
        delegate=delegate,
        chunk_size=1)
    self.assertEqual(1, len(delegate.encodings_attempted))
    self.assertEqual(delegate.encodings_attempted[0], delegate.successful_encoding)
    self.assertEqual(1, len(delegate.dataframes))
    encoding, df = delegate.dataframes[0]
    self.assertEqual(encoding, delegate.successful_encoding)
    self.assertEqual(0, df.shape[0])  # No rows
    self.assertEqual(7, df.shape[1])  # 7 columns
    self.assertEqual(0, delegate.decode_errors)
    self.assertEqual(0, delegate.exceptions)

    delegate = TestGcsfsCsvReaderDelegate()
    self.reader.streaming_read(
        GcsfsFilePath.from_absolute_path(empty_file_path),
        delegate=delegate,
        chunk_size=10)
    self.assertEqual(1, len(delegate.encodings_attempted))
    self.assertEqual(delegate.encodings_attempted[0], delegate.successful_encoding)
    self.assertEqual(1, len(delegate.dataframes))
    encoding, df = delegate.dataframes[0]
    self.assertEqual(encoding, delegate.successful_encoding)
    self.assertEqual(0, df.shape[0])  # No rows
    self.assertEqual(7, df.shape[1])  # 7 columns
    self.assertEqual(0, delegate.decode_errors)
    self.assertEqual(0, delegate.exceptions)
def setUp(self) -> None:
    self.gcs_factory_patcher = mock.patch(
        "recidiviz.admin_panel.dataset_metadata_store.GcsfsFactory.build")
    fake_gcs = FakeGCSFileSystem()
    fake_gcs.upload_from_string(
        path=GcsfsFilePath.from_absolute_path(
            "gs://recidiviz-456-configs/cloud_sql_to_bq_config.yaml"),
        contents="""
region_codes_to_exclude:
  - US_ND
state_history_tables_to_include:
  - state_person_history
county_columns_to_exclude:
  person:
    - full_name
    - birthdate_inferred_from_age
""",
        content_type="text/yaml",
    )
    fake_gcs.upload_from_string(
        path=GcsfsFilePath.from_absolute_path(
            "gs://recidiviz-456-ingest-metadata/ingest_metadata_latest_ingested_upper_bounds.json"
        ),
        contents="""
{"state_code":"US_PA","processed_date":"2020-11-25"}
{"state_code":"US_ID","processed_date":"2021-01-04"}
{"state_code":"US_MO","processed_date":"2020-12-21"}
{"state_code":"US_ND","processed_date":"2020-12-16"}
""",
        content_type="text/text",
    )
    fixture_folder = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        "fixtures",
    )
    self.table_column_map: Dict[str, List[str]] = defaultdict(list)
    for f in os.listdir(fixture_folder):
        _, table, col = f.split("__")
        self.table_column_map[table].append(col[:-len(".json")])
        path = GcsfsFilePath.from_absolute_path(
            f"gs://recidiviz-456-ingest-metadata/{f}")
        fake_gcs.test_add_path(path, local_path=os.path.join(fixture_folder, f))
    self.gcs_factory_patcher.start().return_value = fake_gcs
    self.store = DatasetMetadataCountsStore(
        dataset_nickname="ingest",
        metadata_file_prefix="ingest_state_metadata",
        override_project_id="recidiviz-456",
    )
    self.store.recalculate_store()
def _retrieve_data_for_po_monthly_report(
    state_code: StateCode, batch_id: str
) -> List[Recipient]:
    """Retrieves the data if the report type is POMonthlyReport."""
    data_bucket = utils.get_data_storage_bucket_name()
    data_filename = ""
    gcs_file_system = GcsfsFactory.build()
    try:
        data_filename = utils.get_data_filename(state_code, ReportType.POMonthlyReport)
        path = GcsfsFilePath.from_absolute_path(f"gs://{data_bucket}/{data_filename}")
        file_contents = gcs_file_system.download_as_string(path)
    except BaseException:
        logging.info("Unable to load data file %s/%s", data_bucket, data_filename)
        raise

    archive_bucket = utils.get_data_archive_bucket_name()
    archive_filename = ""
    try:
        archive_filename = utils.get_data_archive_filename(batch_id, state_code)
        archive_path = GcsfsFilePath.from_absolute_path(
            f"gs://{archive_bucket}/{archive_filename}"
        )
        gcs_file_system.upload_from_string(
            path=archive_path, contents=file_contents, content_type="text/json"
        )
    except Exception:
        logging.error(
            "Unable to archive the data file to %s/%s", archive_bucket, archive_filename
        )
        raise

    json_list = file_contents.splitlines()

    recipient_data: List[dict] = []
    for json_str in json_list:
        try:
            item = json.loads(json_str)
        except Exception as err:
            logging.error(
                "Unable to parse JSON found in the file %s. Offending json string is: '%s'. <%s> %s",
                data_filename,
                json_str,
                type(err).__name__,
                err,
            )
        else:
            if email := item.get("email_address"):
                mismatches = _get_mismatch_data_for_officer(email)
                if mismatches is not None:
                    item["mismatches"] = mismatches
            recipient_data.append(item)
def test_metadata_added(self) -> None:
    """Tests that the metadata.json file is correctly added."""
    with open(
        os.path.join(
            f"{os.path.dirname(__file__)}/context/po_monthly_report", FIXTURE_FILE
        )
    ) as fixture_file:
        # Remove newlines
        self._write_test_data(json.dumps(json.loads(fixture_file.read())))

    result = start(
        batch_id="fake-batch-id",
        state_code=StateCode.US_ID,
        report_type=ReportType.POMonthlyReport,
        region_code="US_ID_D3",
    )
    self.assertEqual(len(result.successes), 1)

    # Test that metadata file is created correctly
    metadata_file = self.gcs_file_system.download_as_string(
        GcsfsFilePath.from_absolute_path(
            "gs://recidiviz-test-report-html/US_ID/fake-batch-id/metadata.json"
        )
    )
    self.assertEqual(
        json.loads(metadata_file),
        {
            "report_type": ReportType.POMonthlyReport.value,
            "review_month": "5",
            "review_year": "2021",
        },
    )

    # Try again for Top Opps email
    result = start(
        batch_id="fake-batch-id-2",
        state_code=StateCode.US_ID,
        report_type=ReportType.TopOpportunities,
        region_code="US_ID_D3",
    )

    metadata_file = self.gcs_file_system.download_as_string(
        GcsfsFilePath.from_absolute_path(
            "gs://recidiviz-test-report-html/US_ID/fake-batch-id-2/metadata.json"
        )
    )
    self.assertEqual(
        json.loads(metadata_file),
        {"report_type": ReportType.TopOpportunities.value},
    )
def mock_import_raw_file_to_big_query(
    self,
    *,
    source_uri: str,
    destination_table_schema: List[bigquery.SchemaField],
    **_kwargs: Any,
) -> mock.MagicMock:
    col_names = [schema_field.name for schema_field in destination_table_schema]

    temp_path = GcsfsFilePath.from_absolute_path(source_uri)
    local_temp_path = self.fs.gcs_file_system.real_absolute_path_for_path(temp_path)

    df = pd.read_csv(local_temp_path, header=None, dtype=str)
    for value in df.values:
        for cell in value:
            if isinstance(cell, str):
                stripped_cell = cell.strip()
                if stripped_cell != cell:
                    raise ValueError("Did not strip white space from raw data cell")

            if cell in col_names:
                raise ValueError(f"Wrote column row to output file: {value}")
    self.num_lines_uploaded += len(df)

    return mock.MagicMock()
def _run_gcs_imports() -> Tuple[str, HTTPStatus]:
    """Exposes an endpoint to trigger standard GCS imports."""
    body = get_cloud_task_json_body()
    filename = body.get("filename")
    if not filename:
        return "Must include `filename` in the json payload", HTTPStatus.BAD_REQUEST

    for builder in CASE_TRIAGE_EXPORTED_VIEW_BUILDERS:
        if f"{builder.view_id}.csv" != filename:
            continue

        csv_path = GcsfsFilePath.from_absolute_path(
            os.path.join(
                CASE_TRIAGE_VIEWS_OUTPUT_DIRECTORY_URI.format(
                    project_id=metadata.project_id()),
                filename,
            ))
        import_gcs_csv_to_cloud_sql(
            SchemaType.CASE_TRIAGE,
            builder.view_id,
            csv_path,
            builder.columns,
            seconds_to_wait=180,
        )
        logging.info("View (%s) successfully imported", builder.view_id)

    return "", HTTPStatus.OK
def __init__(self) -> None:
    prefix = "" if not in_gcp() else f"{project_id()}-"
    self.allowlist_path = GcsfsFilePath.from_absolute_path(
        f"{prefix}case-triage-data/allowlist_v2.json"
    )
    self.allowed_users: List[str] = []
    self.admin_users: List[str] = []
def setUp(self) -> None:
    self.redis = fakeredis.FakeRedis()
    path = GcsfsFilePath.from_absolute_path(
        "gs://test_bucket/us_id/ingest_view/2021/01/11/unprocessed_2021-01-11T00:00:00:000000_ingest_view_test_view.csv"
    )
    self.cache = mock.MagicMock()
    self.delegate = CacheIngestFileAsParquetDelegate(self.cache, path)
def get_paths_to_upload(self) -> List[Tuple[str, datetime.datetime]]:
    """Returns the appropriate paths to upload and the proper associated timestamp that
    it is to be normalized with. Skips any files that are not properly supported."""
    path_candidates = []
    for path, timestamp in self.paths_with_timestamps:
        if self.gcsfs.is_dir(path):
            directory = GcsfsDirectoryPath.from_absolute_path(path)
            files_in_directory = self.gcsfs.ls_with_blob_prefix(
                bucket_name=directory.bucket_name,
                blob_prefix=directory.relative_path,
            )
            for file in files_in_directory:
                if self._is_supported_extension(file.abs_path()):
                    path_candidates.append((file.abs_path(), timestamp))
                else:
                    self.skipped_files.append(file.abs_path())
        elif self.gcsfs.is_file(path):
            file = GcsfsFilePath.from_absolute_path(path)
            if self._is_supported_extension(file.abs_path()):
                path_candidates.append((file.abs_path(), timestamp))
            else:
                self.skipped_files.append(file.abs_path())
        else:
            logging.warning(
                "Could not indicate %s as a directory or a file in %s. Skipping",
                path,
                self.destination_ingest_bucket.uri(),
            )
            self.unable_to_upload_files.append(path)
            continue

    return path_candidates
def _to_normalized_file_path_from_normalized_path(
    original_normalized_file_path: str,
    build_function: Callable,
    file_type_override: Optional[GcsfsDirectIngestFileType] = None,
) -> str:
    """Moves any normalized path back to an unprocessed/processed path with the same
    information embedded in the file name. If |file_type_override| is provided, we will
    always overwrite the original path file type with the override file type."""
    directory, _ = os.path.split(original_normalized_file_path)
    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(original_normalized_file_path))

    file_type = file_type_override if file_type_override else parts.file_type

    utc_iso_timestamp_str = parts.utc_upload_datetime.strftime("%Y-%m-%dT%H:%M:%S:%f")
    suffix_str = f"_{parts.filename_suffix}" if parts.filename_suffix else ""
    base_file_name = f"{parts.file_tag}{suffix_str}"

    path_to_return = build_function(
        utc_iso_timestamp_str=utc_iso_timestamp_str,
        file_type=file_type,
        base_file_name=base_file_name,
        extension=parts.extension,
    )

    return os.path.join(directory, path_to_return)
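# A minimal, self-contained sketch of the normalized-filename convention that the helper
# above rebuilds. This is illustrative only: the real name builders are the
# `build_function` callables passed in, and `_sketch_normalized_file_name` is a
# hypothetical stand-in. The timestamp format comes from the strftime call above, and
# the expected string mirrors the fixture path used in the parquet-cache setUp below.
import datetime

def _sketch_normalized_file_name(
    file_tag: str, file_type_str: str, extension: str, upload_dt: datetime.datetime
) -> str:
    # e.g. "unprocessed_2021-01-11T00:00:00:000000_ingest_view_test_view.csv"
    timestamp = upload_dt.strftime("%Y-%m-%dT%H:%M:%S:%f")
    return f"unprocessed_{timestamp}_{file_type_str}_{file_tag}.{extension}"

assert (
    _sketch_normalized_file_name(
        "test_view", "ingest_view", "csv", datetime.datetime(2021, 1, 11)
    )
    == "unprocessed_2021-01-11T00:00:00:000000_ingest_view_test_view.csv"
)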
def set_config_yaml(self, contents: str) -> None:
    path = GcsfsFilePath.from_absolute_path(
        f"gs://{self.mock_project_id}-configs/cloud_sql_to_bq_config.yaml"
    )
    self.fake_gcs.upload_from_string(
        path=path, contents=contents, content_type="text/yaml"
    )
def setUp(self) -> None:
    self.user_1_email = "*****@*****.**"
    self.mock_instance_id = "mock_instance_id"
    self.cloud_sql_client_patcher = patch(
        "recidiviz.cloud_sql.gcs_import_to_cloud_sql.CloudSQLClientImpl")
    self.mock_cloud_sql_client = MagicMock()
    self.cloud_sql_client_patcher.start().return_value = self.mock_cloud_sql_client

    self.mock_sqlalchemy_engine_manager = SQLAlchemyEngineManager
    setattr(
        self.mock_sqlalchemy_engine_manager,
        "get_stripped_cloudsql_instance_id",
        Mock(return_value=self.mock_instance_id),
    )
    self.database_key = SQLAlchemyDatabaseKey.for_schema(SchemaType.CASE_TRIAGE)
    local_postgres_helpers.use_on_disk_postgresql_database(self.database_key)

    self.table_name = DashboardUserRestrictions.__tablename__
    self.columns = [col.name for col in DashboardUserRestrictions.__table__.columns]
    self.gcs_uri = GcsfsFilePath.from_absolute_path(
        "US_MO/dashboard_user_restrictions.csv")
def test_read_with_exception(self) -> None:
    class _TestException(ValueError):
        pass

    class _ExceptionDelegate(_TestGcsfsCsvReaderDelegate):
        def on_dataframe(self, encoding: str, chunk_num: int, df: pd.DataFrame) -> bool:
            should_continue = super().on_dataframe(encoding, chunk_num, df)
            if chunk_num > 0:
                raise _TestException("We crashed processing!")
            return should_continue

    file_path = fixtures.as_filepath("encoded_utf_8.csv")
    delegate = _ExceptionDelegate()

    with self.assertRaises(_TestException):
        self.reader.streaming_read(
            GcsfsFilePath.from_absolute_path(file_path),
            delegate=delegate,
            chunk_size=1,
        )

    self.assertEqual(1, len(delegate.encodings_attempted))
    self.assertEqual("UTF-8", delegate.encodings_attempted[0])
    self.assertIsNone(delegate.successful_encoding)
    self.assertEqual(2, len(delegate.dataframes))
    self.assertEqual({"UTF-8"}, {encoding for encoding, df in delegate.dataframes})
    self.assertEqual(0, delegate.decode_errors)
    self.assertEqual(1, delegate.exceptions)
def test_raw_data_import(self, mock_supported, mock_region, mock_environment):
    mock_supported.return_value = ['us_xx']

    region_code = 'us_xx'

    mock_environment.return_value = 'staging'
    mock_controller = create_autospec(GcsfsDirectIngestController)
    mock_region.return_value = fake_region(region_code=region_code,
                                           environment='staging',
                                           ingestor=mock_controller)

    import_args = GcsfsRawDataBQImportArgs(
        raw_data_file_path=GcsfsFilePath.from_absolute_path(
            to_normalized_unprocessed_file_path(
                'bucket/raw_data_path.csv',
                file_type=GcsfsDirectIngestFileType.RAW_DATA)))
    request_args = {
        'region': region_code,
    }
    body = {
        'cloud_task_args': import_args.to_serializable(),
        'args_type': 'GcsfsRawDataBQImportArgs',
    }
    body_encoded = json.dumps(body).encode()

    headers = {'X-Appengine-Cron': 'test-cron'}

    response = self.client.post('/raw_data_import',
                                query_string=request_args,
                                headers=headers,
                                data=body_encoded)
    self.assertEqual(200, response.status_code)
    mock_controller.do_raw_data_import.assert_called_with(import_args)
def test_handle_file_start_ingest_unsupported_region(
        self, mock_region, mock_environment):
    region_code = 'us_nd'

    mock_environment.return_value = 'production'
    mock_controller = create_autospec(GcsfsDirectIngestController)
    mock_region.return_value = fake_region(region_code=region_code,
                                           environment='staging',
                                           ingestor=mock_controller)
    path = GcsfsFilePath.from_absolute_path('bucket-us-nd/elite_offenders.csv')

    request_args = {
        'region': region_code,
        'bucket': path.bucket_name,
        'relative_file_path': path.blob_name,
        'start_ingest': 'False',
    }
    headers = {'X-Appengine-Cron': 'test-cron'}
    response = self.client.get('/handle_direct_ingest_file',
                               query_string=request_args,
                               headers=headers)

    mock_region.assert_called_with('us_nd', is_direct_ingest=True)
    mock_controller.handle_file.assert_called_with(path, False)

    # Even though the region isn't supported, we don't crash - the
    # controller handles not starting ingest, and if it does by accident,
    # the actual schedule/process_job endpoints handle the unlaunched
    # region check.
    self.assertEqual(200, response.status_code)
def test_handle_file_start_ingest(self, mock_region, mock_environment):
    region_code = 'us_nd'

    mock_environment.return_value = 'production'
    mock_controller = create_autospec(GcsfsDirectIngestController)
    mock_region.return_value = fake_region(region_code=region_code,
                                           environment='production',
                                           ingestor=mock_controller)
    path = GcsfsFilePath.from_absolute_path('bucket-us-nd/elite_offenders.csv')

    request_args = {
        'region': region_code,
        'bucket': path.bucket_name,
        'relative_file_path': path.blob_name,
        'start_ingest': 'True',
    }
    headers = {'X-Appengine-Cron': 'test-cron'}
    response = self.client.get('/handle_direct_ingest_file',
                               query_string=request_args,
                               headers=headers)

    mock_controller.handle_file.assert_called_with(path, True)

    # Even though the region isn't supported, we don't crash
    self.assertEqual(200, response.status_code)
def get_paths_to_upload(self) -> List[Tuple[str, datetime.datetime]]:
    """Returns the appropriate paths to upload and the proper associated timestamp that
    it is to be normalized with. Skips any files that are not properly supported."""
    path_candidates = []
    for path, timestamp in self.paths_with_timestamps:
        if self.gcsfs.is_dir(path):
            directory = GcsfsDirectoryPath.from_absolute_path(path)
            files_in_directory = self.gcsfs.ls_with_blob_prefix(
                bucket_name=directory.bucket_name,
                blob_prefix=directory.relative_path,
            )
            for file in files_in_directory:
                path_candidates.append((file.abs_path(), timestamp))
        elif self.gcsfs.is_file(path):
            file = GcsfsFilePath.from_absolute_path(path)
            path_candidates.append((file.abs_path(), timestamp))
        else:
            logging.warning(
                "Could not indicate %s as a directory or a file in %s. Skipping",
                path,
                self.gcs_destination_path.uri(),
            )
            self.unable_to_upload_files.append(path)
            continue

    result = []
    for path, timestamp in path_candidates:
        _, ext = os.path.splitext(path)
        if not ext or ext not in self.SUPPORTED_EXTENSIONS:
            logging.info("Skipping file [%s] - invalid extension %s", path, ext)
            continue
        result.append((path, timestamp))

    return result
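# A minimal, self-contained sketch of the splitext-based extension filter used at the end
# of get_paths_to_upload above. The SUPPORTED_EXTENSIONS value here is a hypothetical
# stand-in; the real set is an attribute of the uploader class.
import os

_SUPPORTED_EXTENSIONS = {".csv", ".txt"}  # hypothetical stand-in

def _sketch_has_supported_extension(path: str) -> bool:
    _, ext = os.path.splitext(path)
    return bool(ext) and ext in _SUPPORTED_EXTENSIONS

assert _sketch_has_supported_extension("bucket/raw_data_path.csv")
assert not _sketch_has_supported_extension("bucket/raw_data_path")  # no extension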
def test_handle_file_start_ingest_unsupported_region(
        self, mock_region: mock.MagicMock, mock_environment: mock.MagicMock) -> None:
    region_code = "us_nd"

    mock_environment.return_value = "production"
    mock_controller = create_autospec(GcsfsDirectIngestController)
    mock_region.return_value = fake_region(region_code=region_code,
                                           environment="staging",
                                           ingestor=mock_controller)
    path = GcsfsFilePath.from_absolute_path("bucket-us-nd/elite_offenders.csv")

    request_args = {
        "region": region_code,
        "bucket": path.bucket_name,
        "relative_file_path": path.blob_name,
        "start_ingest": "False",
    }
    headers = {"X-Appengine-Cron": "test-cron"}
    response = self.client.get("/handle_direct_ingest_file",
                               query_string=request_args,
                               headers=headers)

    mock_region.assert_called_with("us_nd", is_direct_ingest=True)
    mock_controller.handle_file.assert_called_with(path, False)

    # Even though the region isn't supported, we don't crash - the
    # controller handles not starting ingest, and if it does by accident,
    # the actual schedule/process_job endpoints handle the unlaunched
    # region check.
    self.assertEqual(200, response.status_code)
def test_retrieve_data_po_monthly_report(self) -> None:
    batch_id = "123"
    test_data = "\n".join(
        [
            build_report_json_fixture("*****@*****.**"),
            "my invalid json",
            build_report_json_fixture("*****@*****.**"),
        ]
    )
    self._write_test_data(test_data)

    recipients = retrieve_data(
        state_code=self.state_code,
        report_type=ReportType.POMonthlyReport,
        batch_id=batch_id,
    )

    # Invalid JSON lines are ignored; warnings are logged
    self.assertEqual(len(recipients), 2)
    self.assertEqual(recipients[0].email_address, "*****@*****.**")
    self.assertEqual(recipients[1].email_address, "*****@*****.**")

    # An archive of report JSON is stored
    self.assertEqual(
        self.gcs_file_system.download_as_string(
            GcsfsFilePath.from_absolute_path(
                f"gs://recidiviz-test-report-data-archive/{self.state_code.value}/123.json"
            )
        ),
        test_data,
    )
def test_normalize_file_path(self,
                             mock_fs_factory: mock.MagicMock,
                             mock_environment: mock.MagicMock) -> None:
    mock_environment.return_value = "production"
    mock_fs = FakeGCSFileSystem()
    mock_fs_factory.return_value = mock_fs

    path = GcsfsFilePath.from_absolute_path("bucket-us-xx/file-tag.csv")
    mock_fs.test_add_path(path, local_path=None)

    request_args = {
        "bucket": path.bucket_name,
        "relative_file_path": path.blob_name,
    }
    headers = {"X-Appengine-Cron": "test-cron"}
    response = self.client.get("/normalize_raw_file_path",
                               query_string=request_args,
                               headers=headers)

    self.assertEqual(200, response.status_code)

    self.assertEqual(1, len(mock_fs.all_paths))
    registered_path = mock_fs.all_paths[0]
    if not isinstance(registered_path, GcsfsFilePath):
        self.fail(f"Unexpected type for path [{type(registered_path)}]")
    self.assertTrue(
        DirectIngestGCSFileSystem.is_normalized_file_path(registered_path))
def mv_path_to_normalized_path(
    self,
    path: GcsfsFilePath,
    file_type: GcsfsDirectIngestFileType,
    dt: Optional[datetime.datetime] = None,
) -> GcsfsFilePath:
    """Renames a file with an unnormalized file name to a file with a normalized file
    name in the same directory. If |dt| is specified, the file will contain that
    timestamp, otherwise will contain the current timestamp.

    Returns the new normalized path location of this file after the move completes.
    """
    updated_file_path = GcsfsFilePath.from_absolute_path(
        to_normalized_unprocessed_file_path(path.abs_path(), file_type, dt))

    if self.exists(updated_file_path):
        raise ValueError(f"Desired path [{updated_file_path.abs_path()}] "
                         f"already exists, returning")

    logging.info(
        "Moving [%s] to normalized path [%s].",
        path.abs_path(),
        updated_file_path.abs_path(),
    )
    self.mv(path, updated_file_path)
    return updated_file_path
def collect_file_paths(
    data_discovery_args: DataDiscoveryArgs,
    configs: ConfigsByFileType,
    gcs_files: List[str],
) -> FilesByFileType:
    """Given a set of configs, filter the listed GCS files to only those that match our
    search filters."""
    collected_files = defaultdict(list)

    for found_file in gcs_files:
        try:
            path = GcsfsFilePath.from_absolute_path(found_file)
            file_parts = filename_parts_from_path(path)
        except DirectIngestError as e:
            if e.error_type == DirectIngestErrorType.INPUT_ERROR:
                continue
            logger.exception(e)
            continue

        if (
            not data_discovery_args.start_date
            <= file_parts.utc_upload_datetime.date()
            <= data_discovery_args.end_date
        ):
            continue

        if file_parts.is_file_split:
            continue

        if file_parts.file_tag in configs[file_parts.file_type]:
            collected_files[file_parts.file_type].append(path)

    return collected_files
def setUp(self) -> None:
    self.metadata_patcher = mock.patch("recidiviz.utils.metadata.project_id")
    self.mock_project_id_fn = self.metadata_patcher.start()
    self.mock_project_id_fn.return_value = "recidiviz-staging"

    test_secrets = {
        # pylint: disable=protected-access
        SQLAlchemyEngineManager._get_cloudsql_instance_id_key(schema_type):
            f"test-project:us-east2:{schema_type.value}-data"
        for schema_type in SchemaType
    }
    self.get_secret_patcher = mock.patch("recidiviz.utils.secrets.get_secret")
    self.get_secret_patcher.start().side_effect = test_secrets.get

    self.gcs_factory_patcher = mock.patch(
        "recidiviz.admin_panel.dataset_metadata_store.GcsfsFactory.build")
    self.fake_fs = FakeGCSFileSystem()
    self.gcs_factory_patcher.start().return_value = self.fake_fs

    self.fake_config_path = GcsfsFilePath.from_absolute_path(
        "gs://recidiviz-staging-configs/cloud_sql_to_bq_config.yaml")
def test_handle_file_start_ingest(
        self, mock_region: mock.MagicMock, mock_environment: mock.MagicMock) -> None:
    region_code = "us_nd"

    mock_environment.return_value = "production"
    mock_controller = create_autospec(GcsfsDirectIngestController)
    mock_region.return_value = fake_region(region_code=region_code,
                                           environment="production",
                                           ingestor=mock_controller)
    path = GcsfsFilePath.from_absolute_path("bucket-us-nd/elite_offenders.csv")

    request_args = {
        "region": region_code,
        "bucket": path.bucket_name,
        "relative_file_path": path.blob_name,
        "start_ingest": "True",
    }
    headers = {"X-Appengine-Cron": "test-cron"}
    response = self.client.get("/handle_direct_ingest_file",
                               query_string=request_args,
                               headers=headers)

    mock_controller.handle_file.assert_called_with(path, True)

    # Even though the region isn't supported, we don't crash
    self.assertEqual(200, response.status_code)
def do_raw_data_import(self, data_import_args: GcsfsRawDataBQImportArgs) -> None:
    """Process a raw incoming file by importing it to BQ, tracking it in our metadata
    tables, and moving it to storage on completion.
    """
    check_is_region_launched_in_env(self.region)
    if not self.region.are_raw_data_bq_imports_enabled_in_env():
        raise ValueError(
            f"Raw data imports not enabled for region [{self.region.region_code}]"
        )

    if not self.fs.exists(data_import_args.raw_data_file_path):
        logging.warning(
            "File path [%s] no longer exists - might have already been "
            "processed or deleted",
            data_import_args.raw_data_file_path,
        )
        self.kick_scheduler(just_finished_job=True)
        return

    file_metadata = self.file_metadata_manager.get_file_metadata(
        data_import_args.raw_data_file_path)

    if file_metadata.processed_time:
        logging.warning(
            "File [%s] is already marked as processed. Skipping file processing.",
            data_import_args.raw_data_file_path.file_name,
        )
        self.kick_scheduler(just_finished_job=True)
        return

    self.raw_file_import_manager.import_raw_file_to_big_query(
        data_import_args.raw_data_file_path, file_metadata)

    if not self.region.are_ingest_view_exports_enabled_in_env():
        # TODO(#3162) This is a stopgap measure for regions that have only partially
        #  launched. Delete once SQL pre-processing is enabled for all direct ingest
        #  regions.
        parts = filename_parts_from_path(data_import_args.raw_data_file_path)
        ingest_file_tags = self.get_file_tag_rank_list()

        if parts.file_tag in ingest_file_tags:
            self.fs.copy(
                data_import_args.raw_data_file_path,
                GcsfsFilePath.from_absolute_path(
                    to_normalized_unprocessed_file_path_from_normalized_path(
                        data_import_args.raw_data_file_path.abs_path(),
                        file_type_override=GcsfsDirectIngestFileType.INGEST_VIEW,
                    )
                ),
            )

    processed_path = self.fs.mv_path_to_processed_path(
        data_import_args.raw_data_file_path)

    self.file_metadata_manager.mark_file_as_processed(
        path=data_import_args.raw_data_file_path)

    self.fs.mv_path_to_storage(processed_path, self.storage_directory_path)
    self.kick_scheduler(just_finished_job=True)