def gcsfs_direct_ingest_storage_directory_path_for_region(
    *,
    region_code: str,
    system_level: SystemLevel,
    ingest_instance: DirectIngestInstance,
    file_type: Optional[GcsfsDirectIngestFileType] = None,
    project_id: Optional[str] = None,
) -> GcsfsDirectoryPath:
    if project_id is None:
        project_id = metadata.project_id()

        if not project_id:
            raise ValueError("Project id not set")

    suffix = bucket_suffix_for_ingest_instance(ingest_instance)
    bucket_name = build_ingest_storage_bucket_name(
        project_id=project_id,
        system_level_str=system_level.value.lower(),
        suffix=suffix,
    )
    storage_bucket = GcsfsBucketPath(bucket_name)

    if file_type is not None:
        subdir = os.path.join(region_code.lower(), file_type.value)
    else:
        subdir = region_code.lower()
    return GcsfsDirectoryPath.from_dir_and_subdir(storage_bucket, subdir)
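# A minimal usage sketch for the helper above (not from the original source): the
# region code and project are illustrative, and the enum members used here
# (SystemLevel.STATE, DirectIngestInstance.PRIMARY, GcsfsDirectIngestFileType.INGEST_VIEW)
# all appear elsewhere in this codebase. Passing project_id explicitly skips the
# metadata.project_id() lookup.
storage_dir = gcsfs_direct_ingest_storage_directory_path_for_region(
    region_code="us_xx",
    system_level=SystemLevel.STATE,
    ingest_instance=DirectIngestInstance.PRIMARY,
    file_type=GcsfsDirectIngestFileType.INGEST_VIEW,
    project_id="recidiviz-staging",
)
# storage_dir is a GcsfsDirectoryPath under the per-project storage bucket, ending
# in a "us_xx/<file type>" subdirectory.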
def scheduler() -> Tuple[str, HTTPStatus]:
    """Checks the state of the ingest instance and schedules any tasks to be run."""
    logging.info("Received request for direct ingest scheduler: %s", request.values)
    region_code = get_str_param_value("region", request.values)
    just_finished_job = get_bool_param_value(
        "just_finished_job", request.values, default=False
    )

    # The name of the ingest bucket to schedule work out of
    bucket = get_str_param_value("bucket", request.args)

    if not region_code or just_finished_job is None or not bucket:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    bucket_path = GcsfsBucketPath(bucket)

    with monitoring.push_region_tag(
        region_code,
        ingest_instance=DirectIngestInstance.for_ingest_bucket(bucket_path).value,
    ):
        try:
            controller = DirectIngestControllerFactory.build(
                ingest_bucket_path=bucket_path, allow_unlaunched=False
            )
        except DirectIngestError as e:
            if e.is_bad_request():
                logging.error(str(e))
                return str(e), HTTPStatus.BAD_REQUEST
            raise e

        controller.schedule_next_ingest_job(just_finished_job)
    return "", HTTPStatus.OK
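# Hedged sketch of how this endpoint is typically invoked (parameter names come from
# the handler above; the route and bucket value mirror the Cloud Tasks relative URI
# used in the scheduler-queue test further down):
#   POST /direct/scheduler?region=us_xx&bucket=some-bucket&just_finished_job=False
# `bucket` must be an ingest bucket that DirectIngestInstance.for_ingest_bucket()
# can map to an ingest instance.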
def _get_bucket_metadata(self, path: GcsfsBucketPath) -> BucketSummaryType:
    """Returns a dictionary of file counts for a given bucket, e.g.:
    {
        name: bucket_name,
        unprocessedFilesRaw: number of unprocessed raw data files in the bucket,
        processedFilesRaw: number of processed raw data files in the bucket (should be zero),
        unprocessedFilesIngestView: number of unprocessed ingest view files in the bucket,
        processedFilesIngestView: number of processed ingest view files in the bucket,
    }
    """
    bucket_metadata: BucketSummaryType = {
        "name": path.abs_path(),
    }
    for file_type in GcsfsDirectIngestFileType:
        file_type_str = self.get_file_type_api_string(file_type)

        unprocessed_files = self.fs.get_unprocessed_file_paths(path, file_type)
        bucket_metadata[f"unprocessedFiles{file_type_str}"] = len(unprocessed_files)

        processed_files = self.fs.get_processed_file_paths(path, file_type)
        bucket_metadata[f"processedFiles{file_type_str}"] = len(processed_files)

    return bucket_metadata
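# Illustrative shape of the summary returned above (the counts and bucket name are
# made up; the camelCase key suffixes follow the docstring and the values produced
# by get_file_type_api_string):
example_bucket_summary: BucketSummaryType = {
    "name": "some-ingest-bucket",
    "unprocessedFilesRaw": 3,
    "processedFilesRaw": 0,
    "unprocessedFilesIngestView": 1,
    "processedFilesIngestView": 2,
}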
def test_gcsfs_sftp_download_directory_path_fails_for_county(self) -> None:
    sftp_download_bucket = gcsfs_sftp_download_bucket_path_for_region(
        "us_xx_yyyy", SystemLevel.COUNTY
    )
    self.assertEqual(
        sftp_download_bucket,
        GcsfsBucketPath("recidiviz-staging-direct-ingest-county-us-xx-yyyy-sftp"),
    )
def test_retry(self) -> None:
    mock_bucket = create_autospec(Bucket)
    mock_bucket.exists.return_value = True
    # Client first raises a Gateway timeout, then returns a normal bucket.
    self.mock_storage_client.get_bucket.side_effect = [
        exceptions.GatewayTimeout("Exception"),
        mock_bucket,
    ]

    # Should not crash!
    self.assertTrue(
        self.fs.exists(GcsfsBucketPath.from_absolute_path("gs://my-bucket"))
    )
def test_retry_with_fatal_error(self) -> None:
    mock_bucket = create_autospec(Bucket)
    mock_bucket.exists.return_value = True
    # Client first raises a Gateway timeout, then on retry raises a ValueError.
    self.mock_storage_client.get_bucket.side_effect = [
        exceptions.GatewayTimeout("Exception"),
        ValueError("This will crash"),
    ]

    with self.assertRaises(ValueError):
        self.fs.exists(GcsfsBucketPath.from_absolute_path("gs://my-bucket"))
def gcsfs_sftp_download_bucket_path_for_region(
    region_code: str, system_level: SystemLevel, project_id: Optional[str] = None
) -> GcsfsBucketPath:
    """Returns the GCS bucket path that will hold the files downloaded from SFTP."""
    if project_id is None:
        project_id = metadata.project_id()

        if not project_id:
            raise ValueError("Project id not set")

    bucket_name = build_ingest_bucket_name(
        project_id=project_id,
        region_code=region_code,
        system_level_str=system_level.value.lower(),
        suffix=INGEST_SFTP_BUCKET_SUFFIX,
    )
    return GcsfsBucketPath(bucket_name)
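# Hedged usage sketch (values illustrative): passing project_id explicitly avoids the
# metadata.project_id() lookup that the tests below patch out.
sftp_bucket = gcsfs_sftp_download_bucket_path_for_region(
    "us_nd", SystemLevel.STATE, project_id="recidiviz-staging"
)
# With these inputs, the test further down expects the bucket
# "recidiviz-staging-direct-ingest-state-us-nd-sftp".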
def handle_direct_ingest_file() -> Tuple[str, HTTPStatus]:
    """Called from a Cloud Function when a new file is added to a direct ingest
    bucket. Triggers a job that normalizes and splits the file as appropriate, then
    starts the scheduler if allowed.
    """
    region_code = get_str_param_value("region", request.args)
    # The bucket name for the file to ingest
    bucket = get_str_param_value("bucket", request.args)
    # The relative path to the file, not including the bucket name
    relative_file_path = get_str_param_value(
        "relative_file_path", request.args, preserve_case=True
    )
    start_ingest = get_bool_param_value("start_ingest", request.args, default=False)

    if not region_code or not bucket or not relative_file_path or start_ingest is None:
        response = f"Bad parameters [{request.args}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    bucket_path = GcsfsBucketPath(bucket_name=bucket)

    with monitoring.push_region_tag(
        region_code,
        ingest_instance=DirectIngestInstance.for_ingest_bucket(bucket_path).value,
    ):
        try:
            controller = DirectIngestControllerFactory.build(
                ingest_bucket_path=bucket_path,
                allow_unlaunched=True,
            )
        except DirectIngestError as e:
            if e.is_bad_request():
                logging.error(str(e))
                return str(e), HTTPStatus.BAD_REQUEST
            raise e

        path = GcsfsPath.from_bucket_and_blob_name(
            bucket_name=bucket, blob_name=relative_file_path
        )

        if isinstance(path, GcsfsFilePath):
            controller.handle_file(path, start_ingest=start_ingest)

    return "", HTTPStatus.OK
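# Hedged sketch of the request this handler expects (parameter names come from the
# handler above; the values, and the idea that a Cloud Function forwards them from a
# GCS object-finalize event, are assumptions consistent with the docstring):
example_handle_file_params = {
    "region": "us_xx",
    "bucket": "some-ingest-bucket",
    "relative_file_path": "my_file.csv",  # path within the bucket (illustrative)
    "start_ingest": "True",
}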
def _generate_output_path(
    self,
    ingest_view_export_args: GcsfsIngestViewExportArgs,
    metadata: DirectIngestIngestFileMetadata,
) -> GcsfsFilePath:
    ingest_view = self.ingest_views_by_tag[ingest_view_export_args.ingest_view_name]
    if not metadata.normalized_file_name:
        output_file_name = to_normalized_unprocessed_file_name(
            f"{ingest_view.file_tag}.csv",
            GcsfsDirectIngestFileType.INGEST_VIEW,
            dt=ingest_view_export_args.upper_bound_datetime_to_export,
        )
    else:
        output_file_name = metadata.normalized_file_name

    return GcsfsFilePath.from_directory_and_file_name(
        GcsfsBucketPath(ingest_view_export_args.output_bucket_name),
        output_file_name,
    )
def setUp(self) -> None:
    self.project_id = "recidiviz-456"
    self.project_id_patcher = patch("recidiviz.utils.metadata.project_id")
    self.project_id_patcher.start().return_value = self.project_id
    self.test_region = fake_region(
        region_code="us_xx", region_module=fake_regions_module
    )

    self.fs = DirectIngestGCSFileSystem(FakeGCSFileSystem())
    self.ingest_bucket_path = GcsfsBucketPath(bucket_name="my_ingest_bucket")
    self.temp_output_path = GcsfsDirectoryPath(bucket_name="temp_bucket")

    self.region_raw_file_config = DirectIngestRegionRawFileConfig(
        region_code="us_xx", region_module=fake_regions_module
    )

    self.mock_big_query_client = create_autospec(BigQueryClient)
    self.num_lines_uploaded = 0
    self.mock_big_query_client.insert_into_table_from_cloud_storage_async.side_effect = (
        self.mock_import_raw_file_to_big_query
    )

    self.import_manager = DirectIngestRawFileImportManager(
        region=self.test_region,
        fs=self.fs,
        ingest_bucket_path=self.ingest_bucket_path,
        temp_output_directory_path=self.temp_output_path,
        region_raw_file_config=self.region_raw_file_config,
        big_query_client=self.mock_big_query_client,
    )
    self.import_manager.csv_reader = GcsfsCsvReader(self.fs.gcs_file_system)

    self.time_patcher = patch(
        "recidiviz.ingest.direct.controllers.direct_ingest_raw_file_import_manager.time"
    )
    self.mock_time = self.time_patcher.start()

    def fake_get_dataset_ref(dataset_id: str) -> bigquery.DatasetReference:
        return bigquery.DatasetReference(
            project=self.project_id, dataset_id=dataset_id
        )

    self.mock_big_query_client.dataset_ref_for_id = fake_get_dataset_ref
def gcsfs_direct_ingest_bucket_for_region(
    *,
    region_code: str,
    system_level: SystemLevel,
    ingest_instance: DirectIngestInstance,
    project_id: Optional[str] = None,
) -> GcsfsBucketPath:
    if project_id is None:
        project_id = metadata.project_id()

        if not project_id:
            raise ValueError("Project id not set")

    suffix = bucket_suffix_for_ingest_instance(ingest_instance)
    bucket_name = build_ingest_bucket_name(
        project_id=project_id,
        region_code=region_code,
        system_level_str=system_level.value.lower(),
        suffix=suffix,
    )
    return GcsfsBucketPath(bucket_name=bucket_name)
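# Companion usage sketch (illustrative values, not from the original source): unlike
# the storage-path helper earlier in this file, this returns the ingest bucket itself
# rather than a directory path within it.
ingest_bucket = gcsfs_direct_ingest_bucket_for_region(
    region_code="us_xx",
    system_level=SystemLevel.STATE,
    ingest_instance=DirectIngestInstance.SECONDARY,
    project_id="recidiviz-staging",
)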
def test_create_direct_ingest_scheduler_queue_task_secondary(
    self, mock_client: mock.MagicMock, mock_uuid: mock.MagicMock
) -> None:
    # Arrange
    body_encoded = json.dumps({}).encode()
    uuid = "random-uuid"
    mock_uuid.uuid4.return_value = uuid
    queue_path = "us_xx-scheduler-queue-path"
    queue_name = "direct-ingest-state-us-xx-scheduler"
    task_name = "{}/{}-{}-{}".format(
        queue_name, _REGION.region_code, "2019-07-20", uuid
    )
    task = tasks_v2.types.task_pb2.Task(
        name=task_name,
        app_engine_http_request={
            "http_method": "POST",
            "relative_uri": f"/direct/scheduler?region={_REGION.region_code}&"
            f"bucket=some-bucket&just_finished_job=False",
            "body": body_encoded,
        },
    )
    mock_client.return_value.task_path.return_value = task_name
    mock_client.return_value.queue_path.return_value = queue_path

    # Act
    DirectIngestCloudTaskManagerImpl().create_direct_ingest_scheduler_queue_task(
        region=_REGION,
        ingest_instance=DirectIngestInstance.SECONDARY,
        ingest_bucket=GcsfsBucketPath("some-bucket"),
        just_finished_job=False,
    )

    # Assert
    mock_client.return_value.queue_path.assert_called_with(
        self.mock_project_id, QUEUES_REGION, queue_name
    )
    mock_client.return_value.create_task.assert_called_with(
        parent=queue_path, task=task
    )
def handle_new_files() -> Tuple[str, HTTPStatus]:
    """Normalizes and splits files in the ingest bucket for a given region as
    appropriate. Will schedule the next process_job task if no renaming / splitting
    work has been done that would trigger subsequent calls to this endpoint.
    """
    logging.info(
        "Received request for direct ingest handle_new_files: %s", request.values
    )
    region_code = get_str_param_value("region", request.values)
    can_start_ingest = get_bool_param_value(
        "can_start_ingest", request.values, default=False
    )
    bucket = get_str_param_value("bucket", request.values)

    if not region_code or can_start_ingest is None or not bucket:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    bucket_path = GcsfsBucketPath(bucket_name=bucket)

    with monitoring.push_region_tag(
        region_code,
        ingest_instance=DirectIngestInstance.for_ingest_bucket(bucket_path).value,
    ):
        try:
            controller = DirectIngestControllerFactory.build(
                ingest_bucket_path=bucket_path,
                allow_unlaunched=True,
            )
        except DirectIngestError as e:
            if e.is_bad_request():
                logging.error(str(e))
                return str(e), HTTPStatus.BAD_REQUEST
            raise e

        controller.handle_new_files(can_start_ingest=can_start_ingest)
    return "", HTTPStatus.OK
def test_gcsfs_sftp_download_directory_path_for_region(self) -> None:
    self.assertEqual(
        gcsfs_sftp_download_bucket_path_for_region("us_nd", SystemLevel.STATE),
        GcsfsBucketPath("recidiviz-staging-direct-ingest-state-us-nd-sftp"),
    )
def upload_from_sftp() -> Tuple[str, HTTPStatus]:
    """Connects to remote SFTP servers and uploads the files, in both raw and
    normalized form, to GCS buckets to start the ingest process. Should only be
    called from a task queue scheduler.

    Args:
        region_code (Optional[str]): required as part of the request to identify the region
        date_str (Optional[str]): ISO format date string, used to determine the lower
            bound date from which to start pulling items from the SFTP server. If None,
            uses yesterday as the default lower bound time; otherwise creates a datetime
            from the string.
        bucket_str (Optional[str]): GCS bucket name, used to override the destination to
            which the SFTP assets are downloaded and moved for proper ingest (therefore
            used in both controllers). If None, uses the bucket determined by
            |region_code|; otherwise, uses this destination.
    """
    logging.info("Received request for uploading files from SFTP: %s", request.values)
    region_code = get_str_param_value("region", request.values)
    date_str = get_str_param_value("date", request.values)
    bucket_str = get_str_param_value("bucket", request.values)
    gcs_destination_path = GcsfsBucketPath(bucket_str) if bucket_str else None

    if not region_code:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code, ingest_instance=None):
        lower_bound_update_datetime = (
            datetime.datetime.fromisoformat(date_str)
            if date_str is not None
            else datetime.datetime.now(tz=pytz.UTC) - datetime.timedelta(1)
        )
        sftp_controller = DownloadFilesFromSftpController(
            project_id=metadata.project_id(),
            region=region_code,
            lower_bound_update_datetime=lower_bound_update_datetime,
            gcs_destination_path=gcs_destination_path,
        )
        download_result = sftp_controller.do_fetch()

        with monitoring.measurements(
            {TagKey.SFTP_TASK_TYPE: "download"}
        ) as download_measurements:
            download_measurements.measure_int_put(
                m_sftp_attempts,
                len(download_result.successes) + len(download_result.failures),
            )
            download_measurements.measure_int_put(
                m_sftp_errors, len(download_result.failures)
            )

        unable_to_download_text = (
            f"Unable to download the following files: {download_result.failures}"
            if download_result.failures
            else ""
        )
        skipped_download_text = (
            f"Skipped downloading the following files: {download_result.skipped}"
            if download_result.skipped
            else ""
        )

        if not download_result.successes and download_result.failures:
            return (
                f"All files failed to download. {unable_to_download_text}",
                HTTPStatus.BAD_REQUEST,
            )

        if not download_result.successes and not download_result.skipped:
            return f"No items to download for {region_code}", HTTPStatus.BAD_REQUEST

        if not download_result.successes and download_result.skipped:
            return f"All files skipped. {skipped_download_text}", HTTPStatus.OK

        if not download_result.successes:
            raise ValueError("Expected non-empty successes here.")

        upload_controller = UploadStateFilesToIngestBucketController(
            paths_with_timestamps=download_result.successes,
            project_id=metadata.project_id(),
            region=region_code,
            gcs_destination_path=gcs_destination_path,
        )
        upload_result = upload_controller.do_upload()

        with monitoring.measurements(
            {TagKey.SFTP_TASK_TYPE: "upload"}
        ) as upload_measurements:
            upload_measurements.measure_int_put(
                m_sftp_attempts,
                len(upload_result.successes) + len(upload_result.failures),
            )
            upload_measurements.measure_int_put(
                m_sftp_errors, len(upload_result.failures)
            )

        sftp_controller.clean_up()

        unable_to_upload_text = (
            f"Unable to upload the following files: {upload_result.failures}"
            if upload_result.failures
            else ""
        )
        skipped_text = (
            f"Skipped uploading the following files: {upload_controller.skipped_files}"
            if upload_result.skipped
            else ""
        )

        if not upload_result.successes and not upload_result.skipped:
            return (
                f"{unable_to_download_text}"
                f" All files failed to upload. {unable_to_upload_text}"
                f"{skipped_text}",
                HTTPStatus.BAD_REQUEST,
            )

        if download_result.failures or upload_result.failures:
            return (
                f"{unable_to_download_text}"
                f" {unable_to_upload_text}"
                f"{skipped_text}",
                HTTPStatus.MULTI_STATUS,
            )

        if not upload_result.successes and upload_result.skipped:
            return f"All files skipped. {skipped_text}", HTTPStatus.OK

        if upload_result.skipped:
            return (
                f"{unable_to_download_text}"
                f" {unable_to_upload_text}"
                f"{skipped_text}",
                HTTPStatus.MULTI_STATUS,
            )

        # Trigger ingest to handle the copied files (in case the queue has already
        # emptied while ingest was paused).
        direct_ingest_cloud_task_manager = DirectIngestCloudTaskManagerImpl()
        direct_ingest_cloud_task_manager.create_direct_ingest_handle_new_files_task(
            region=_region_for_region_code(region_code),
            ingest_instance=DirectIngestInstance.PRIMARY,
            ingest_bucket=upload_controller.destination_ingest_bucket,
            can_start_ingest=True,
        )

    return "", HTTPStatus.OK
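# Hedged sketch of the query parameters this endpoint reads (names come from the
# handler above; the values are illustrative):
example_sftp_upload_params = {
    "region": "us_xx",                      # required
    "date": "2021-01-01T00:00:00",          # optional ISO lower-bound datetime
    "bucket": "my-override-ingest-bucket",  # optional destination override
}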
def ingest_view_export() -> Tuple[str, HTTPStatus]:
    """Exports an ingest view from BQ to a file in the region's GCS File System ingest
    bucket that is ready to be processed and ingested into our Recidiviz DB.
    """
    logging.info(
        "Received request to do direct ingest view export: [%s]", request.values
    )
    region_code = get_str_param_value("region", request.values)
    output_bucket_name = get_str_param_value(
        "output_bucket", request.values, preserve_case=True
    )

    if not region_code or not output_bucket_name:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(
        region_code,
        ingest_instance=DirectIngestInstance.for_ingest_bucket(
            GcsfsBucketPath(output_bucket_name)
        ).value,
    ):
        json_data = request.get_data(as_text=True)
        ingest_view_export_args = _parse_cloud_task_args(json_data)

        if not ingest_view_export_args:
            raise DirectIngestError(
                msg="ingest_view_export was called with no GcsfsIngestViewExportArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(ingest_view_export_args, GcsfsIngestViewExportArgs):
            raise DirectIngestError(
                msg=f"ingest_view_export was called with incorrect args type "
                f"[{type(ingest_view_export_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if output_bucket_name != ingest_view_export_args.output_bucket_name:
            raise DirectIngestError(
                msg=f"Different buckets were passed in the url and request body\n"
                f"url: {output_bucket_name}\n"
                f"body: {ingest_view_export_args.output_bucket_name}",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags(
            {TagKey.INGEST_VIEW_EXPORT_TAG: ingest_view_export_args.task_id_tag()}
        ):
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=GcsfsBucketPath(
                        ingest_view_export_args.output_bucket_name
                    ),
                    allow_unlaunched=False,
                )
            except DirectIngestError as e:
                if e.is_bad_request():
                    logging.error(str(e))
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            controller.do_ingest_view_export(ingest_view_export_args)
    return "", HTTPStatus.OK
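# Hedged sketch of what this endpoint expects (names taken from the handler above and
# from GcsfsIngestViewExportArgs usage elsewhere in this file; values are illustrative):
# `region` and `output_bucket` arrive as query parameters, and the request body carries
# serialized GcsfsIngestViewExportArgs whose output_bucket_name must match `output_bucket`.
example_ingest_view_export_params = {
    "region": "us_xx",
    "output_bucket": "some-ingest-bucket",
}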
def main() -> None:
    """Executes the main flow of the script."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "paths",
        metavar="PATH",
        nargs="+",
        help="Path to files to move, either a single file path or a directory path.",
    )
    parser.add_argument(
        "--project-id",
        required=True,
        help="Which project the file(s) should be uploaded to (e.g. recidiviz-123).",
    )
    parser.add_argument("--region", required=True, help="E.g. 'us_nd'")
    parser.add_argument(
        "--date", required=True, help="The date to be associated with this file."
    )
    parser.add_argument(
        "--dry-run",
        type=str_to_bool,
        default=True,
        help="Whether or not to run this script in dry run (log only) mode.",
    )
    parser.add_argument(
        "--destination-bucket",
        type=str,
        default=None,
        help="Override destination bucket for the upload. Can be used to upload files "
        "to an arbitrary testing bucket with normalized names.",
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    override_bucket = (
        GcsfsBucketPath(args.destination_bucket) if args.destination_bucket else None
    )
    controller = ManualUploadStateFilesToIngestBucketController(
        paths=args.paths,
        project_id=args.project_id,
        region=args.region,
        date=args.date,
        dry_run=args.dry_run,
        destination_bucket_override=override_bucket,
    )

    if controller.dry_run:
        logging.info("Running in DRY RUN mode for region [%s]", controller.region)
    else:
        i = input(
            f"This will upload raw files to the [{controller.region}] ingest bucket "
            f"[{controller.destination_ingest_bucket.uri()}] with datetime "
            f"[{args.date}]. Type {controller.project_id} to continue: "
        )
        if i != controller.project_id:
            return

    if override_bucket:
        if not controller.dry_run:
            i = input(
                f"Are you sure you want to upload to non-standard bucket "
                f"[{controller.destination_ingest_bucket.uri()}]? Type "
                f"{controller.destination_ingest_bucket.bucket_name} to continue: "
            )
            if i != controller.destination_ingest_bucket.bucket_name:
                return

    msg_prefix = "DRY_RUN: " if controller.dry_run else ""
    controller.move_progress = Bar(
        f"{msg_prefix}Moving files...", max=len(controller.get_paths_to_upload())
    )
    controller.do_upload()

    if not controller.move_progress:
        raise ValueError("Progress bar should not be None")
    controller.move_progress.finish()

    controller.write_copies_to_log_file()

    if controller.dry_run:
        logging.info(
            "DRY RUN: See results in [%s].\nRerun with [--dry-run False] to execute move.",
            controller.log_output_path,
        )
    else:
        logging.info(
            "Upload complete! See results in [%s].", controller.log_output_path
        )
    )
    parser.add_argument(
        "--source_bucket",
        type=str,
        required=True,
        help="A sandbox GCS bucket where raw files live. Files in this bucket must "
        "already have normalized file names.",
    )
    parser.add_argument(
        "--file_tag_filter",
        default=None,
        help="Regex file tag filter - when set, will only import files whose tags "
        "contain a match to this regex.",
    )
    return parser.parse_known_args(argv)


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    known_args, _ = parse_arguments(sys.argv)

    with local_project_id_override(GCP_PROJECT_STAGING):
        do_upload(
            state_code=StateCode(known_args.state_code),
            sandbox_dataset_prefix=known_args.sandbox_dataset_prefix,
            source_bucket=GcsfsBucketPath(known_args.source_bucket),
            file_tag_filter=known_args.file_tag_filter,
        )