def dataflow_monitor() -> Tuple[str, HTTPStatus]:
    """Calls the dataflow monitor manager to begin monitoring a Dataflow job.

    Endpoint path parameters:
        job_id: The unique id of the job to monitor
        location: The region where the job is being run
        topic: The Pub/Sub topic to publish a message to if the job is
            successful
    """
    job_id = get_str_param_value('job_id', request.args)
    location = get_str_param_value('location', request.args)
    topic = get_str_param_value('topic', request.args)

    if not job_id:
        raise ValueError('Unexpected empty job_id.')
    if not location:
        raise ValueError('Unexpected empty location.')
    if not topic:
        raise ValueError('Unexpected empty topic.')

    logging.info(
        "Attempting to monitor the job with id: %s. Will "
        "publish to %s on success.", job_id, topic)

    CalculateCloudTaskManager().create_dataflow_monitor_task(
        job_id, location, topic)
    return '', HTTPStatus.OK

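# Illustrative only: the handlers in this module rely on small request-parameter
# helpers (get_str_param_value / get_bool_param_value) defined elsewhere in the
# codebase. The sketch below mirrors the behavior their call sites appear to
# assume (string values lowercased unless preserve_case=True; booleans parsed
# leniently), under hypothetical underscore-prefixed names so it does not shadow
# the real imports; the actual implementations may differ.
from typing import Mapping, Optional


def _example_get_str_param_value(arg_key: str,
                                 args: Mapping[str, str],
                                 default: Optional[str] = None,
                                 preserve_case: bool = False) -> Optional[str]:
    """Returns the query parameter value, lowercased unless preserve_case."""
    value = args.get(arg_key, default)
    if value is not None and not preserve_case:
        value = value.lower()
    return value


def _example_get_bool_param_value(arg_key: str,
                                  args: Mapping[str, str],
                                  default: Optional[bool] = None
                                  ) -> Optional[bool]:
    """Returns the query parameter parsed leniently as a boolean."""
    value = args.get(arg_key)
    if value is None:
        return default
    return value.lower() in ("true", "t", "1", "yes")
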
def normalize_raw_file_path() -> Tuple[str, HTTPStatus]:
    """Called from a Cloud Function when a new file is added to a bucket that
    is configured to rename files but not ingest them. For example, a bucket
    that is being used for automatic data transfer testing.
    """
    # The bucket name for the file to normalize
    bucket = get_str_param_value("bucket", request.args)
    # The relative path to the file, not including the bucket name
    relative_file_path = get_str_param_value("relative_file_path",
                                             request.args,
                                             preserve_case=True)

    if not bucket or not relative_file_path:
        return f"Bad parameters [{request.args}]", HTTPStatus.BAD_REQUEST

    path = GcsfsPath.from_bucket_and_blob_name(bucket_name=bucket,
                                               blob_name=relative_file_path)

    if not isinstance(path, GcsfsFilePath):
        raise ValueError(
            f"Incorrect type [{type(path)}] for path: {path.uri()}")

    fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
    fs.mv_path_to_normalized_path(
        path, file_type=GcsfsDirectIngestFileType.RAW_DATA)

    return "", HTTPStatus.OK

def handle_direct_ingest_file() -> Tuple[str, HTTPStatus]:
    """Called from a Cloud Function when a new file is added to a direct
    ingest bucket. Will trigger a job that deals with normalizing and
    splitting the file as is appropriate, then start the scheduler if allowed.
    """
    region_code = get_str_param_value('region', request.args)
    # The bucket name for the file to ingest
    bucket = get_str_param_value('bucket', request.args)
    # The relative path to the file, not including the bucket name
    relative_file_path = get_str_param_value('relative_file_path',
                                             request.args,
                                             preserve_case=True)
    start_ingest = \
        get_bool_param_value('start_ingest', request.args, default=False)

    if not region_code or not bucket \
            or not relative_file_path or start_ingest is None:
        return f'Bad parameters [{request.args}]', HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        controller = controller_for_region_code(region_code,
                                                allow_unlaunched=True)
        if not isinstance(controller, GcsfsDirectIngestController):
            raise DirectIngestError(
                msg=f"Unexpected controller type [{type(controller)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR)

        path = GcsfsPath.from_bucket_and_blob_name(
            bucket_name=bucket, blob_name=relative_file_path)

        if isinstance(path, GcsfsFilePath):
            controller.handle_file(path, start_ingest=start_ingest)

    return '', HTTPStatus.OK

def scheduler() -> Tuple[str, HTTPStatus]:
    """Checks the state of the ingest instance and schedules any tasks to be
    run."""
    logging.info("Received request for direct ingest scheduler: %s",
                 request.values)
    region_code = get_str_param_value("region", request.values)
    just_finished_job = get_bool_param_value("just_finished_job",
                                             request.values,
                                             default=False)

    # The bucket name for ingest instance to schedule work out of
    bucket = get_str_param_value("bucket", request.args)

    if not region_code or just_finished_job is None or not bucket:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    bucket_path = GcsfsBucketPath(bucket)

    with monitoring.push_region_tag(
            region_code,
            ingest_instance=DirectIngestInstance.for_ingest_bucket(
                bucket_path).value,
    ):
        try:
            controller = DirectIngestControllerFactory.build(
                ingest_bucket_path=bucket_path, allow_unlaunched=False)
        except DirectIngestError as e:
            if e.is_bad_request():
                logging.error(str(e))
                return str(e), HTTPStatus.BAD_REQUEST
            raise e

        controller.schedule_next_ingest_job(just_finished_job)
    return "", HTTPStatus.OK

def deliver_emails_for_batch() -> Tuple[str, HTTPStatus]:
    """Deliver a batch of generated emails.

    Query parameters:
        batch_id: (required) Identifier for this batch
        redirect_address: (optional) An email address to which all emails will
            be sent. This can be used for redirecting all of the reports to a
            supervisor.

    Returns:
        Text indicating the results of the run and an HTTP status

    Raises:
        Nothing. Catch everything so that we can always return a response to
        the request
    """
    batch_id = get_str_param_value('batch_id', request.args)
    redirect_address = get_str_param_value('redirect_address', request.args)

    if not batch_id:
        msg = "Query parameter 'batch_id' not received"
        logging.error(msg)
        return msg, HTTPStatus.BAD_REQUEST

    if redirect_address:
        success_count, failure_count = email_delivery.deliver(
            batch_id, redirect_address=redirect_address)
        return (f"Sent {success_count} emails to the test address {redirect_address}. "
                f"{failure_count} emails failed to send"), HTTPStatus.OK

    success_count, failure_count = email_delivery.deliver(batch_id)
    return (f"Sent {success_count} emails. "
            f"{failure_count} emails failed to send"), HTTPStatus.OK

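# Illustrative request for the handler above (the URL path is hypothetical;
# route registration is not shown in this file):
#   GET /deliver_emails_for_batch?batch_id=<batch_id>&redirect_address=<addr>
# When redirect_address is supplied, every email in the batch is delivered to
# that single address instead of the usual recipients.
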
def start_new_batch() -> Tuple[str, HTTPStatus]:
    """Start a new batch of email generation for the indicated state.

    Query parameters:
        state_code: (required) A valid state code for which reporting is
            enabled (ex: "US_ID")
        report_type: (required) A valid report type identifier
            (ex: "po_monthly_report")
        test_address: (optional) Should only be used for testing. When
            provided, the test_address is used to generate the email
            filenames, ensuring that all emails in the batch can only be
            delivered to the test_address and not to the usual recipients of
            the report. The email filenames will include the original
            recipient's email username, for example: [email protected].

    Returns:
        Text indicating the results of the run and an HTTP status

    Raises:
        Nothing. Catch everything so that we can always return a response to
        the request
    """
    state_code = get_str_param_value('state_code', request.args)
    report_type = get_str_param_value('report_type', request.args)
    test_address = get_str_param_value('test_address', request.args)

    if not state_code or not report_type:
        msg = "Request does not include 'state_code' and 'report_type' parameters"
        logging.error(msg)
        return msg, HTTPStatus.BAD_REQUEST

    state_code = state_code.upper()

    batch_id = data_retrieval.start(state_code, report_type, test_address)

    test_address_text = (f"Emails generated for test address: {test_address}"
                         if test_address else "")
    return (f"New batch started for {state_code} and {report_type}. Batch "
            f"id = {batch_id}. {test_address_text}"), HTTPStatus.OK

def upload_from_sftp() -> Tuple[str, HTTPStatus]:
    """Connects to remote SFTP servers and uploads the files in both raw and
    normalized form to GCS buckets to start the ingest process. Should only be
    called from a task queue scheduler.

    Args:
        region_code (Optional[str]): required as part of the request to
            identify the region
        date_str (Optional[str]): ISO format date string, used to determine
            the lower bound date in which to start pulling items from the SFTP
            server. If None, uses yesterday as the default lower bound time;
            otherwise creates a datetime from the string.
        bucket_str (Optional[str]): GCS bucket name, used to override the
            destination in which the SFTP assets are downloaded to and moved
            for proper ingest (therefore used in both controllers). If None,
            uses the bucket determined by |region_code|; otherwise uses this
            destination.
    """
    logging.info("Received request for uploading files from SFTP: %s",
                 request.values)
    region_code = get_str_param_value("region", request.values)
    date_str = get_str_param_value("date", request.values)
    bucket_str = get_str_param_value("bucket", request.values)

    if not region_code:
        return f"Bad parameters [{request.values}]", HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        lower_bound_update_datetime = (
            datetime.datetime.fromisoformat(date_str)
            if date_str is not None
            else datetime.datetime.utcnow() - datetime.timedelta(1))
        sftp_controller = DownloadFilesFromSftpController(
            project_id=metadata.project_id(),
            region=region_code,
            lower_bound_update_datetime=lower_bound_update_datetime,
            gcs_destination_path=bucket_str,
        )
        downloaded_items, unable_to_download_items = sftp_controller.do_fetch()

        if downloaded_items:
            _, unable_to_upload_files = UploadStateFilesToIngestBucketController(
                paths_with_timestamps=downloaded_items,
                project_id=metadata.project_id(),
                region=region_code,
                gcs_destination_path=bucket_str,
            ).do_upload()
            sftp_controller.clean_up()
            if unable_to_download_items or unable_to_upload_files:
                return (
                    f"Unable to download the following files: {unable_to_download_items}, "
                    f"and upload the following files: {unable_to_upload_files}",
                    HTTPStatus.MULTI_STATUS,
                )
        elif unable_to_download_items:
            return (
                f"Unable to download the following files {unable_to_download_items}",
                HTTPStatus.MULTI_STATUS,
            )
        elif not downloaded_items and not unable_to_download_items:
            return f"No items to download for {region_code}", HTTPStatus.MULTI_STATUS

    return "", HTTPStatus.OK

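# For example, a request with date=2021-01-01 yields a lower bound of
# datetime.datetime(2021, 1, 1, 0, 0) via datetime.fromisoformat; omitting the
# date parameter falls back to 24 hours before the current UTC time.
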
def state_aggregate() -> Tuple[str, HTTPStatus]:
    """Calls state aggregates"""
    # Please add new states in alphabetical order
    state_to_parser = {
        'california': ca_aggregate_ingest.parse,
        'florida': fl_aggregate_ingest.parse,
        'georgia': ga_aggregate_ingest.parse,
        'hawaii': hi_aggregate_ingest.parse,
        'kentucky': ky_aggregate_ingest.parse,
        'new_york': ny_aggregate_ingest.parse,
        'pennsylvania': pa_aggregate_ingest.parse,
        'tennessee': tn_aggregate_ingest.parse,
        'texas': tx_aggregate_ingest.parse,
    }

    bucket = get_str_param_value('bucket', request.args)
    state = get_str_param_value('state', request.args)
    filename = get_str_param_value('filename', request.args)
    project_id = metadata.project_id()
    logging.info("The project id is %s", project_id)
    if not bucket or not state or not filename:
        raise StateAggregateError(
            "All of state, bucket, and filename must be provided")
    path = os.path.join(bucket, state, filename)
    parser = state_to_parser[state]
    # Don't use the gcsfs cache
    fs = gcsfs.GCSFileSystem(project=project_id,
                             cache_timeout=GCSFS_NO_CACHING)
    logging.info("The path to download from is %s", path)

    # TODO(#3292): Uncomment once gcsfs.ls is more stable
    # bucket_path = os.path.join(bucket, state)
    # logging.info("The files in the directory are:")
    # logging.info(fs.ls(bucket_path))

    # Providing a stream buffer to tabula reader does not work because it
    # tries to load the file into the local filesystem, since appengine is a
    # read only filesystem (except for the tmpdir) we download the file into
    # the local tmpdir and pass that in.
    tmpdir_path = os.path.join(tempfile.gettempdir(), filename)
    fs.get(path, tmpdir_path)
    logging.info("Successfully downloaded file from gcs: %s", path)

    try:
        result = parser(os.path.join(bucket, state), tmpdir_path)
        logging.info('Successfully parsed the report')
        for table, df in result.items():
            dao.write_df(table, df)

        # If we are successful, we want to move the file out of the cloud
        # function triggered directory, and into the historical path.
        historical_path = os.path.join(HISTORICAL_BUCKET.format(project_id),
                                       state, filename)
        fs.mv(path, historical_path)
        return '', HTTPStatus.OK
    except Exception as e:
        # Exception objects are not JSON serializable, so serialize the
        # message rather than the exception itself.
        return jsonify(str(e)), HTTPStatus.INTERNAL_SERVER_ERROR

def process_job() -> Tuple[str, HTTPStatus]:
    """Processes a single direct ingest file, specified in the provided ingest
    arguments.
    """
    logging.info('Received request to process direct ingest job: [%s]',
                 request.values)
    region_code = get_str_param_value('region', request.values)

    if not region_code:
        return f'Bad parameters [{request.values}]', HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        json_data = request.get_data(as_text=True)
        ingest_args = _get_ingest_args(json_data)
        if not ingest_args:
            return 'Could not parse ingest args', HTTPStatus.BAD_REQUEST
        with monitoring.push_tags(
                {TagKey.INGEST_TASK_TAG: ingest_args.task_id_tag()}):
            try:
                if not ingest_args:
                    raise DirectIngestError(
                        msg="process_job was called with no IngestArgs.",
                        error_type=DirectIngestErrorType.INPUT_ERROR)

                controller = controller_for_region_code(region_code)
            except DirectIngestError as e:
                if e.is_bad_request():
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            controller.run_ingest_job_and_kick_scheduler_on_completion(
                ingest_args)
    return '', HTTPStatus.OK

def create_metric_view_data_export_tasks() -> Tuple[str, HTTPStatus]:
    """Queues a task to export data in BigQuery metric views to cloud storage
    buckets.

    Example:
        export/create_metric_view_data_export_tasks?export_job_filter=US_ID

    URL parameters:
        export_job_filter: Job name to initiate export for (e.g. US_ID or
            LANTERN). If state_code, will create tasks for all products that
            have launched for that state_code. If product name, will create
            tasks for all states that have launched for that product.

    Args:
        N/A

    Returns:
        N/A
    """
    logging.info("Queueing a task to export view data to cloud storage")

    export_job_filter = get_str_param_value("export_job_filter", request.args)

    if not export_job_filter:
        return (
            "Missing required export_job_filter URL parameter",
            HTTPStatus.BAD_REQUEST,
        )

    relevant_product_exports = ProductConfigs.from_file(
        path=PRODUCTS_CONFIG_PATH).get_export_configs_for_job_filter(
            export_job_filter)

    for export in relevant_product_exports:
        ViewExportCloudTaskManager().create_metric_view_data_export_task(
            export_job_name=export["export_job_name"],
            state_code=export["state_code"])

    return "", HTTPStatus.OK

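# Each entry returned by get_export_configs_for_job_filter is expected to carry
# at least the two keys used above, e.g. (illustrative values only):
#   {"export_job_name": "PO_MONTHLY", "state_code": "US_ID"}
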
def metric_view_data_export() -> Tuple[str, HTTPStatus]:
    """Exports data in BigQuery metric views to cloud storage buckets.

    Example:
        export/metric_view_data?export_job_filter=US_ID

    URL parameters:
        export_job_filter: (string) Kind of jobs to initiate export for. Can
            either be an export_name (e.g. LANTERN) or a state_code
            (e.g. US_ND)

    Args:
        N/A

    Returns:
        N/A
    """
    logging.info("Attempting to export view data to cloud storage")

    export_job_filter = get_str_param_value("export_job_filter", request.args)

    if not export_job_filter:
        return (
            "missing required export_job_filter URL parameter",
            HTTPStatus.BAD_REQUEST,
        )

    export_view_data_to_cloud_storage(export_job_filter)

    return "", HTTPStatus.OK

def create_metric_view_data_export_task() -> Tuple[str, HTTPStatus]:
    """Queues a task to export data in BigQuery metric views to cloud storage
    buckets.

    Example:
        export/create_metric_view_data_export_task?export_job_filter=US_ID

    URL parameters:
        export_job_filter: (string) Kind of jobs to initiate export for. Can
            either be an export_name (e.g. LANTERN) or a state_code
            (e.g. US_ND)

    Args:
        N/A

    Returns:
        N/A
    """
    logging.info("Queueing a task to export view data to cloud storage")

    export_job_filter = get_str_param_value("export_job_filter", request.args)

    if not export_job_filter:
        return (
            "missing required export_job_filter URL parameter",
            HTTPStatus.BAD_REQUEST,
        )

    ViewExportCloudTaskManager().create_metric_view_data_export_task(
        export_job_filter=export_job_filter)

    return "", HTTPStatus.OK

def handle_sftp_files() -> Tuple[str, HTTPStatus]:
    """Schedules the SFTP downloads into the appropriate cloud task queue."""
    logging.info("Received request for handling SFTP files: %s",
                 request.values)
    region_code = get_str_param_value("region", request.values)

    if not region_code:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code, ingest_instance=None):
        try:
            region = _region_for_region_code(region_code)
            direct_ingest_cloud_task_manager = DirectIngestCloudTaskManagerImpl()
            direct_ingest_cloud_task_manager.create_direct_ingest_sftp_download_task(
                region)
        except FileNotFoundError as e:
            raise DirectIngestError(
                msg=f"Region [{region_code}] has no registered manifest",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            ) from e

    return "", HTTPStatus.OK

def handle_new_files() -> Tuple[str, HTTPStatus]:
    """Normalizes and splits files in the ingest bucket for a given region as
    is appropriate. Will schedule the next process_job task if no renaming /
    splitting work has been done that will trigger subsequent calls to this
    endpoint.
    """
    logging.info('Received request for direct ingest handle_new_files: %s',
                 request.values)
    region_code = get_str_param_value('region', request.values)
    can_start_ingest = \
        get_bool_param_value('can_start_ingest', request.values, default=False)

    if not region_code or can_start_ingest is None:
        return f'Bad parameters [{request.values}]', HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        try:
            controller = controller_for_region_code(region_code,
                                                    allow_unlaunched=True)
        except DirectIngestError as e:
            if e.is_bad_request():
                return str(e), HTTPStatus.BAD_REQUEST
            raise e

        if not isinstance(controller, GcsfsDirectIngestController):
            raise DirectIngestError(
                msg=f"Unexpected controller type [{type(controller)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR)

        controller.handle_new_files(can_start_ingest=can_start_ingest)
    return '', HTTPStatus.OK

def handle_direct_ingest_file() -> Tuple[str, HTTPStatus]:
    """Called from a Cloud Function when a new file is added to a direct
    ingest bucket. Will trigger a job that deals with normalizing and
    splitting the file as is appropriate, then start the scheduler if allowed.
    """
    region_code = get_str_param_value("region", request.args)
    # The bucket name for the file to ingest
    bucket = get_str_param_value("bucket", request.args)
    # The relative path to the file, not including the bucket name
    relative_file_path = get_str_param_value("relative_file_path",
                                             request.args,
                                             preserve_case=True)
    start_ingest = get_bool_param_value("start_ingest",
                                        request.args,
                                        default=False)

    if not region_code or not bucket or not relative_file_path or start_ingest is None:
        response = f"Bad parameters [{request.args}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    bucket_path = GcsfsBucketPath(bucket_name=bucket)

    with monitoring.push_region_tag(
            region_code,
            ingest_instance=DirectIngestInstance.for_ingest_bucket(
                bucket_path).value,
    ):
        try:
            controller = DirectIngestControllerFactory.build(
                ingest_bucket_path=bucket_path,
                allow_unlaunched=True,
            )
        except DirectIngestError as e:
            if e.is_bad_request():
                logging.error(str(e))
                return str(e), HTTPStatus.BAD_REQUEST
            raise e

        path = GcsfsPath.from_bucket_and_blob_name(
            bucket_name=bucket, blob_name=relative_file_path)

        if isinstance(path, GcsfsFilePath):
            controller.handle_file(path, start_ingest=start_ingest)

    return "", HTTPStatus.OK

def state_aggregate() -> Tuple[str, HTTPStatus]:
    """Calls state aggregates"""
    bucket = get_str_param_value("bucket", request.args)
    state = get_str_param_value("state", request.args)
    filename = get_str_param_value("filename", request.args)
    project_id = metadata.project_id()
    logging.info("The project id is %s", project_id)
    if not bucket or not state or not filename:
        raise StateAggregateError(
            "All of state, bucket, and filename must be provided")
    directory_path = GcsfsDirectoryPath(bucket, state)
    path = GcsfsFilePath.from_directory_and_file_name(directory_path, filename)
    parser = STATE_TO_PARSER[state]
    fs = GcsfsFactory.build()
    logging.info("The path to download from is %s", path)

    logging.info("The files in the directory are:")
    logging.info(
        fs.ls_with_blob_prefix(
            bucket_name=directory_path.bucket_name,
            blob_prefix=directory_path.relative_path,
        )
    )

    # Providing a stream buffer to tabula reader does not work because it
    # tries to load the file into the local filesystem, since appengine is a
    # read only filesystem (except for the tmpdir) we download the file into
    # the local tmpdir and pass that in.
    handle = fs.download_to_temp_file(path)
    if not handle:
        raise StateAggregateError(f"Unable to download file: {path}")
    logging.info("Successfully downloaded file from gcs: %s",
                 handle.local_file_path)

    result = parser(handle.local_file_path)
    logging.info("Successfully parsed the report")
    for table, df in result.items():
        dao.write_df(table, df)

    # If we are successful, we want to move the file out of the cloud
    # function triggered directory, and into the historical path.
    historical_path = GcsfsFilePath.from_directory_and_file_name(
        GcsfsDirectoryPath(HISTORICAL_BUCKET.format(project_id), state),
        filename)
    fs.mv(path, historical_path)
    return "", HTTPStatus.OK

def scrape_aggregate_reports():
    """Calls state aggregates"""

    # Please add new states in alphabetical order
    state_to_scraper = {
        "california": ca_aggregate_site_scraper.get_urls_to_download,
        "colorado": co_aggregate_site_scraper.get_urls_to_download,
        "florida": fl_aggregate_site_scraper.get_urls_to_download,
        "georgia": ga_aggregate_site_scraper.get_urls_to_download,
        "hawaii": hi_aggregate_site_scraper.get_urls_to_download,
        "kentucky": ky_aggregate_site_scraper.get_urls_to_download,
        "new_york": ny_aggregate_site_scraper.get_urls_to_download,
        "tennessee": tn_aggregate_site_scraper.get_urls_to_download,
        "texas": tx_aggregate_site_scraper.get_urls_to_download,
        "west_virginia": wv_aggregate_site_scraper.get_urls_to_download,
    }
    state = get_str_param_value("state", request.args)
    # We want to always download the pdf if it is NY because they always have
    # the same name.
    always_download = state == "new_york"
    is_ca = state == "california"
    is_co = state == "colorado"
    verify_ssl = state != "kentucky"
    urls = state_to_scraper[state]()
    fs = GcsfsFactory.build()
    logging.info("Scraping all pdfs for %s", state)

    for url in urls:
        post_data = None
        if isinstance(url, tuple):
            url, post_data = url
            # We need to append the year of the report to create uniqueness in
            # the name since california sends post requests with the same url.
            pdf_name = state
            if is_ca:
                pdf_name += str(post_data["year"])
        elif is_co:
            pdf_name = date.today().strftime("colorado-%m-%Y")
        else:
            pdf_name = urlparse(url).path.replace("/", "_").lower()
        historical_path = build_path(HISTORICAL_BUCKET, state, pdf_name)
        file_to_upload = _get_file_to_upload(historical_path, fs, url,
                                             pdf_name, always_download,
                                             post_data, verify_ssl)
        if file_to_upload:
            upload_path = build_path(UPLOAD_BUCKET, state, pdf_name)
            fs.upload_from_contents_handle_stream(
                path=upload_path,
                contents_handle=file_to_upload,
                content_type="application/pdf",
            )
            logging.info("Successfully downloaded %s", url)
        else:
            logging.info("Skipping %s because the file already exists", url)

    return "", HTTPStatus.OK

def ingest() -> Tuple[str, HTTPStatus]:
    manifest_path = get_str_param_value("manifest_path",
                                        request.args,
                                        preserve_case=True)
    if not manifest_path:
        raise exceptions.BadRequest("Parameter `manifest_path` is required.")

    manual_upload.ingest(GcsfsFactory.build(),
                         GcsfsFilePath.from_absolute_path(manifest_path))

    return "", HTTPStatus.OK

def store_single_count_endpoint():
    """Endpoint to store a single count"""
    jid = get_str_param_value('jid', request.args)
    ethnicity = get_str_param_value('ethnicity', request.args)
    gender = get_str_param_value('gender', request.args)
    race = get_str_param_value('race', request.args)
    count = get_str_param_value('count', request.args)
    date = get_str_param_value('date', request.args)

    sc = SingleCount(
        count=count,
        ethnicity=ethnicity,
        gender=gender,
        race=race,
        date=date,
    )

    stored = store_single_count(sc, jid)

    if stored:
        # count arrives as a string query parameter, so log it with %s.
        logging.info("Stored [%s] as [%s] for [%s]", count,
                     ' '.join(filter(None, (race, gender, ethnicity))), jid)
        return '', HTTPStatus.OK

    logging.error("Failed to store single count for [%s]", jid)
    return '', HTTPStatus.INTERNAL_SERVER_ERROR

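# The log line above joins only the demographic fields that were actually
# provided, e.g. (illustrative values):
#   ' '.join(filter(None, ('BLACK', None, 'HISPANIC')))  ->  'BLACK HISPANIC'
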
def handle_new_files() -> Tuple[str, HTTPStatus]:
    """Normalizes and splits files in the ingest bucket for a given region as
    is appropriate. Will schedule the next process_job task if no renaming /
    splitting work has been done that will trigger subsequent calls to this
    endpoint.
    """
    logging.info("Received request for direct ingest handle_new_files: %s",
                 request.values)
    region_code = get_str_param_value("region", request.values)
    can_start_ingest = get_bool_param_value("can_start_ingest",
                                            request.values,
                                            default=False)
    bucket = get_str_param_value("bucket", request.values)

    if not region_code or can_start_ingest is None or not bucket:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    bucket_path = GcsfsBucketPath(bucket_name=bucket)

    with monitoring.push_region_tag(
            region_code,
            ingest_instance=DirectIngestInstance.for_ingest_bucket(
                bucket_path).value,
    ):
        try:
            controller = DirectIngestControllerFactory.build(
                ingest_bucket_path=bucket_path,
                allow_unlaunched=True,
            )
        except DirectIngestError as e:
            if e.is_bad_request():
                logging.error(str(e))
                return str(e), HTTPStatus.BAD_REQUEST
            raise e

        controller.handle_new_files(can_start_ingest=can_start_ingest)
    return "", HTTPStatus.OK

def scrape_aggregate_reports():
    """Calls state aggregates"""

    # Please add new states in alphabetical order
    state_to_scraper = {
        "california": ca_aggregate_site_scraper.get_urls_to_download,
        "florida": fl_aggregate_site_scraper.get_urls_to_download,
        "georgia": ga_aggregate_site_scraper.get_urls_to_download,
        "hawaii": hi_aggregate_site_scraper.get_urls_to_download,
        "kentucky": ky_aggregate_site_scraper.get_urls_to_download,
        "new_york": ny_aggregate_site_scraper.get_urls_to_download,
        "tennessee": tn_aggregate_site_scraper.get_urls_to_download,
        "texas": tx_aggregate_site_scraper.get_urls_to_download,
    }
    state = get_str_param_value("state", request.args)
    # We want to always download the pdf if it is NY because they always have
    # the same name.
    always_download = state == "new_york"
    is_ca = state == "california"
    verify_ssl = state != "kentucky"
    urls = state_to_scraper[state]()
    gcp_project = metadata.project_id()
    historical_bucket = HISTORICAL_BUCKET.format(gcp_project)
    upload_bucket = UPLOAD_BUCKET.format(gcp_project)
    fs = gcsfs.GCSFileSystem(project=gcp_project,
                             cache_timeout=GCSFS_NO_CACHING)
    logging.info("Scraping all pdfs for %s", state)

    for url in urls:
        post_data = None
        if isinstance(url, tuple):
            url, post_data = url
            # We need to append the year of the report to create uniqueness in
            # the name since california sends post requests with the same url.
            pdf_name = state
            if is_ca:
                pdf_name += str(post_data["year"])
        else:
            pdf_name = urlparse(url).path.replace("/", "_").lower()
        historical_path = os.path.join(historical_bucket, state, pdf_name)
        file_to_upload = _get_file_to_upload(historical_path, fs, url,
                                             pdf_name, always_download,
                                             post_data, verify_ssl)
        if file_to_upload:
            upload_path = os.path.join(upload_bucket, state, pdf_name)
            fs.put(file_to_upload, upload_path)
            logging.info("Successfully downloaded %s", url)
        else:
            logging.info("Skipping %s because the file already exists", url)

    return "", HTTPStatus.OK

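# Example of the pdf_name derivation used by both scrape_aggregate_reports
# variants above for a non-California state (illustrative URL, not taken from
# any real scraper config):
#   urlparse("https://example.gov/reports/2020/Jail_Census.pdf").path
#       -> "/reports/2020/Jail_Census.pdf"
#   .replace("/", "_").lower()
#       -> "_reports_2020_jail_census.pdf"
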
def dashboard_export():
    """Calls the dashboard export manager.

    Endpoint path parameters:
        bucket: A string indicating the GCP cloud storage bucket to export to
        data_type: A string, either DATAFLOW or STANDARD for the type of data
            that should be exported
    """
    # The cloud storage bucket to export to
    bucket = get_str_param_value('bucket', request.args)
    # Get the type of data to export
    data_type = get_str_param_value('data_type', request.args)

    logging.info(
        "Attempting to export dashboard %s data to cloud storage"
        " bucket: %s.", data_type, bucket)

    dashboard_export_manager.export_dashboard_data_to_cloud_storage(
        bucket, data_type)

    return '', HTTPStatus.OK

def metric_view_data_export() -> Tuple[str, HTTPStatus]:
    """Exports data in BigQuery metric views to cloud storage buckets.

    Example:
        export/metric_view_data?export_job_name=PO_MONTHLY&state_code=US_ID

    URL parameters:
        export_job_name: Name of job to initiate export for (e.g. PO_MONTHLY).
        state_code: (Optional) State code to initiate export for (e.g. US_ID).
            State code must be present if the job is not state agnostic.

    Args:
        N/A

    Returns:
        N/A
    """
    logging.info("Attempting to export view data to cloud storage")

    export_job_name = get_str_param_value("export_job_name", request.args)
    state_code = get_str_param_value("state_code", request.args)

    if not export_job_name:
        return (
            "Missing required export_job_name URL parameter",
            HTTPStatus.BAD_REQUEST,
        )

    product_configs = ProductConfigs.from_file(path=PRODUCTS_CONFIG_PATH)
    try:
        _ = product_configs.get_export_config(export_job_name=export_job_name,
                                              state_code=state_code)
    except BadProductExportSpecificationError as e:
        logging.exception(e)
        return str(e), HTTPStatus.BAD_REQUEST

    export_view_data_to_cloud_storage(export_job_name=export_job_name,
                                      state_code=state_code)

    return "", HTTPStatus.OK

def update_raw_data_latest_views_for_state() -> Tuple[str, HTTPStatus]:
    """Updates raw data tables for a given state"""
    logging.info("Received request to do direct ingest raw data update: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)

    if not region_code:
        return f"Bad parameters [{request.values}]", HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        bq_client = BigQueryClientImpl(project_id=metadata.project_id())
        controller = DirectIngestRawDataTableLatestViewUpdater(
            region_code, metadata.project_id(), bq_client)
        controller.update_views_for_state()
        return "", HTTPStatus.OK

def ingest() -> Tuple[str, HTTPStatus]:
    manifest_path = get_str_param_value("manifest_path",
                                        request.args,
                                        preserve_case=True)
    if not manifest_path:
        return "Parameter `manifest_path` is required.", HTTPStatus.BAD_REQUEST

    try:
        manual_upload.ingest(GcsfsFactory.build(),
                             GcsfsFilePath.from_absolute_path(manifest_path))
    except Exception as e:
        return f"Error ingesting data: '{e}'", HTTPStatus.INTERNAL_SERVER_ERROR

    return "", HTTPStatus.OK

def start_direct_ingest():
    """Schedules direct ingest jobs for the given region, if necessary."""
    region_name = get_str_param_value('region', request.args)
    try:
        controller = \
            direct_ingest_control.controller_for_region_code(region_name)
        controller.kick_scheduler(just_finished_job=False)
    except DirectIngestError as e:
        project_id = metadata.project_id()
        message = \
            f"Error scheduling next ingest job for region [{region_name}] on " \
            f"project [{project_id}]: [{str(e)}]"
        return message, HTTPStatus.INTERNAL_SERVER_ERROR

    return '', HTTPStatus.OK

def dashboard_export():
    """Calls the dashboard export manager.

    Endpoint path parameters:
        bucket: A string indicating the GCP cloud storage bucket to export to
    """
    # The cloud storage bucket to export to
    bucket = get_str_param_value('bucket', request.args)

    logging.info(
        "Attempting to export dashboard data to cloud storage"
        " bucket: %s.", bucket)

    dashboard_export_manager.export_dashboard_data_to_cloud_storage(bucket)

    return '', HTTPStatus.OK

def ingest_view_export() -> Tuple[str, HTTPStatus]:
    """Exports an ingest view from BQ to a file in the region's GCS File
    System ingest bucket that is ready to be processed and ingested into our
    Recidiviz DB.
    """
    logging.info("Received request to do direct ingest view export: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)

    if not region_code:
        return f"Bad parameters [{request.values}]", HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        json_data = request.get_data(as_text=True)
        ingest_view_export_args = _parse_cloud_task_args(json_data)

        if not ingest_view_export_args:
            raise DirectIngestError(
                msg="ingest_view_export was called with no "
                "GcsfsIngestViewExportArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(ingest_view_export_args, GcsfsIngestViewExportArgs):
            raise DirectIngestError(
                msg=f"ingest_view_export was called with incorrect args type "
                f"[{type(ingest_view_export_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags({
                TagKey.INGEST_VIEW_EXPORT_TAG:
                ingest_view_export_args.task_id_tag()
        }):
            try:
                controller = controller_for_region_code(region_code)
            except DirectIngestError as e:
                if e.is_bad_request():
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            if not isinstance(controller, GcsfsDirectIngestController):
                raise DirectIngestError(
                    msg=f"Unexpected controller type [{type(controller)}].",
                    error_type=DirectIngestErrorType.INPUT_ERROR,
                )

            controller.do_ingest_view_export(ingest_view_export_args)
    return "", HTTPStatus.OK

def raw_data_import() -> Tuple[str, HTTPStatus]:
    """Imports a single raw direct ingest CSV file from a location in GCS
    File System to its corresponding raw data table in BQ.
    """
    logging.info("Received request to do direct ingest raw data import: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)

    if not region_code:
        return f"Bad parameters [{request.values}]", HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        json_data = request.get_data(as_text=True)
        data_import_args = _parse_cloud_task_args(json_data)

        if not data_import_args:
            raise DirectIngestError(
                msg="raw_data_import was called with no IngestArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(data_import_args, GcsfsRawDataBQImportArgs):
            raise DirectIngestError(
                msg=f"raw_data_import was called with incorrect args type "
                f"[{type(data_import_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags(
                {TagKey.RAW_DATA_IMPORT_TAG: data_import_args.task_id_tag()}):
            try:
                controller = controller_for_region_code(region_code)
            except DirectIngestError as e:
                if e.is_bad_request():
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            if not isinstance(controller, GcsfsDirectIngestController):
                raise DirectIngestError(
                    msg=f"Unexpected controller type [{type(controller)}].",
                    error_type=DirectIngestErrorType.INPUT_ERROR,
                )

            controller.do_raw_data_import(data_import_args)
    return "", HTTPStatus.OK

def process_job() -> Tuple[str, HTTPStatus]:
    """Processes a single direct ingest file, specified in the provided ingest
    arguments.
    """
    logging.info("Received request to process direct ingest job: [%s]",
                 request.values)
    region_code = get_str_param_value("region", request.values)

    if not region_code:
        return f"Bad parameters [{request.values}]", HTTPStatus.BAD_REQUEST

    with monitoring.push_region_tag(region_code):
        json_data = request.get_data(as_text=True)
        ingest_args = _parse_cloud_task_args(json_data)

        if not ingest_args:
            raise DirectIngestError(
                msg="process_job was called with no IngestArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not isinstance(ingest_args, IngestArgs):
            raise DirectIngestError(
                msg=f"process_job was called with incorrect args type "
                f"[{type(ingest_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        if not ingest_args:
            return "Could not parse ingest args", HTTPStatus.BAD_REQUEST

        with monitoring.push_tags(
                {TagKey.INGEST_TASK_TAG: ingest_args.task_id_tag()}):
            try:
                controller = controller_for_region_code(region_code)
            except DirectIngestError as e:
                if e.is_bad_request():
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            try:
                controller.run_ingest_job_and_kick_scheduler_on_completion(
                    ingest_args)
            except GCSPseudoLockAlreadyExists as e:
                return str(e), HTTPStatus.CONFLICT
    return "", HTTPStatus.OK
