class HealthCheck:
    """Run a daily health check for an Upload Service deployment and post
    the report to the #upload-service Slack channel via webhook.

    Three signals are aggregated:
      * pre-checksum deadletter queue depth / arrivals (CloudWatch SQS metrics)
      * counts of stale, scheduled, and failed checksum/validation jobs and
        undeleted upload areas (upload DB)
      * Upload API and checksum-daemon Lambda error counts (CloudWatch)
    """

    def __init__(self):
        # Deployment stage (e.g. dev/staging/prod); also embedded in the AWS
        # resource names used in the CloudWatch metric queries below.
        self.env = os.environ['DEPLOYMENT_STAGE']
        self.db = UploadDB()
        logger.debug(f"Running a health check for {self.env}. Results will be posted in #upload-service")
        self.webhook = UploadConfig().slack_webhook

        # NOTE(review): the two "stale" queries select jobs whose updated_at is
        # *within* the last 2 hours (`updated_at > CURRENT_TIMESTAMP - 2h`),
        # while the report text reads "stuck ... (for over 2 hours)", which
        # would correspond to `<`. Confirm intended semantics before changing.
        self.stale_checksum_job_count_query = "SELECT COUNT(*) FROM checksum " \
            "WHERE status='CHECKSUMMING' " \
            "AND created_at > CURRENT_DATE - interval '4 weeks' " \
            "AND updated_at > CURRENT_TIMESTAMP - interval '2 hours'"

        self.stale_validation_job_count_query = "SELECT COUNT(*) FROM validation " \
            "WHERE status='VALIDATING' " \
            "AND created_at > CURRENT_DATE - interval '4 weeks' " \
            "AND updated_at > CURRENT_TIMESTAMP - interval '2 hours'"

        self.scheduled_checksum_job_count_query = "SELECT COUNT(*) FROM checksum " \
            "WHERE status='SCHEDULED' " \
            "AND created_at > CURRENT_DATE - interval '4 weeks' " \
            "AND updated_at > CURRENT_TIMESTAMP - interval '2 hours'"

        self.scheduled_validation_job_count_query = "SELECT COUNT(*) FROM validation " \
            "WHERE status='SCHEDULED' " \
            "AND created_at > CURRENT_DATE - interval '4 weeks' " \
            "AND updated_at > CURRENT_TIMESTAMP - interval '2 hours'"

        self.undeleted_areas_count_query = "SELECT COUNT(*) FROM upload_area " \
            "WHERE created_at > CURRENT_DATE - interval '4 weeks' " \
            "AND status != 'DELETED'"

        self.failed_checksum_count_query = "SELECT COUNT(*) FROM checksum " \
            "WHERE status='FAILED' " \
            "AND updated_at >= NOW() - '1 day'::INTERVAL"

        self.failed_validation_count_query = "SELECT COUNT(*) FROM validation " \
            "WHERE status='FAILED' " \
            "AND updated_at >= NOW() - '1 day'::INTERVAL"

        # GetMetricData queries for the pre-checksum deadletter queue.
        # Period of 90000s (25h) covers the whole 24h query window in one datapoint.
        self.deadletter_metric_queries = [
            {
                'Id': 'visible_messages',
                'MetricStat': {
                    'Metric': {
                        'Namespace': 'AWS/SQS',
                        'MetricName': 'ApproximateNumberOfMessagesVisible',
                        'Dimensions': [{
                            'Name': 'QueueName',
                            'Value': f'dcp-upload-pre-csum-deadletter-queue-{self.env}'
                        }]
                    },
                    'Period': 90000,
                    'Stat': 'Average'
                }
            },
            {
                'Id': 'received_messages',
                'MetricStat': {
                    'Metric': {
                        'Namespace': 'AWS/SQS',
                        'MetricName': 'NumberOfMessagesReceived',
                        'Dimensions': [{
                            'Name': 'QueueName',
                            'Value': f'dcp-upload-pre-csum-deadletter-queue-{self.env}'
                        }]
                    },
                    'Period': 90000,
                    'Stat': 'Average'
                }
            }
        ]

        # GetMetricData queries for Lambda error totals over the same window.
        self.lambda_error_queries = [
            {
                'Id': 'upload_api_lambda_errors',
                'MetricStat': {
                    'Metric': {
                        'Namespace': 'AWS/Lambda',
                        'MetricName': 'Errors',
                        'Dimensions': [{
                            'Name': 'FunctionName',
                            'Value': f'upload-api-{self.env}'
                        }]
                    },
                    'Period': 90000,
                    'Stat': 'Sum'
                }
            },
            {
                'Id': 'checksum_daemon_lambda_errors',
                'MetricStat': {
                    'Metric': {
                        'Namespace': 'AWS/Lambda',
                        'MetricName': 'Errors',
                        'Dimensions': [{
                            'Name': 'FunctionName',
                            'Value': f'dcp-upload-csum-{self.env}'
                        }]
                    },
                    'Period': 90000,
                    'Stat': 'Sum'
                }
            }
        ]

    def run_upload_service_health_check(self):
        """Gather all three status sections and post one report to Slack.

        The attachment is green ("good") only when every section reports
        exactly 'GOOD\\n'; otherwise the concatenated details are posted.
        """
        deadletter_queue_info = self.generate_deadletter_queue_status()
        upload_area_info = self.generate_upload_area_status()
        lambda_info = self.generate_lambda_error_status()

        if deadletter_queue_info == upload_area_info == lambda_info == 'GOOD\n':
            color = 'good'
            status_info = "It's 6 o'clock somewhere and all is well"
        else:
            color = 'bad'
            status_info = (f"DEADLETTER_QUEUE: {deadletter_queue_info}" +
                           f"UPLOAD_AREAS: {upload_area_info}" +
                           f"LAMBDAS: {lambda_info}")

        attachments = [{
            "title": f"Health Check Report for {self.env}:",
            "color": color,
            "text": status_info
        }]
        self.post_message_to_url(self.webhook, {"attachments": attachments})

    def generate_deadletter_queue_status(self):
        """Return 'GOOD\\n' when no messages arrived in the deadletter queue
        over the past day, else a one-line summary of queue depth/arrivals.
        """
        deadletter_results = self._query_cloudwatch_metrics_for_past_day(
            self.deadletter_metric_queries)
        if deadletter_results['received_messages'] == 0:
            deadletter_queue_status = "GOOD\n"
        else:
            deadletter_queue_status = f"{deadletter_results['visible_messages']} in queue, " \
                f"{deadletter_results['received_messages']} added in past 24 hrs\n"
        return deadletter_queue_status

    def generate_lambda_error_status(self):
        """Return 'GOOD\\n' when neither Lambda reported errors in the past
        day, else a one-line summary of both error counts.
        """
        lambda_error_results = self._query_cloudwatch_metrics_for_past_day(
            self.lambda_error_queries)
        if lambda_error_results['upload_api_lambda_errors'] == 0 and \
                lambda_error_results['checksum_daemon_lambda_errors'] == 0:
            lambda_error_status = 'GOOD\n'
        else:
            lambda_error_status = f"{lambda_error_results['upload_api_lambda_errors']} errors for Upload API, " \
                f"{lambda_error_results['checksum_daemon_lambda_errors']} errors for csum daemon\n"
        return lambda_error_status

    def generate_upload_area_status(self):
        """Return 'GOOD\\n' when no stale/scheduled/failed jobs exist, else a
        multi-line summary of all upload-area job counts.

        Note: the undeleted-area count is reported in the summary but does not
        by itself trigger a non-GOOD status.
        """
        undeleted_upload_area_count = self._query_db_and_return_first_row(
            self.undeleted_areas_count_query)
        stale_checksumming_areas = self._query_db_and_return_first_row(
            self.stale_checksum_job_count_query)
        stale_validating_areas = self._query_db_and_return_first_row(
            self.stale_validation_job_count_query)
        scheduled_checksum_areas = self._query_db_and_return_first_row(
            self.scheduled_checksum_job_count_query)
        scheduled_validation_areas = self._query_db_and_return_first_row(
            self.scheduled_validation_job_count_query)
        failed_checksum_count = self._query_db_and_return_first_row(
            self.failed_checksum_count_query)
        failed_validation_count = self._query_db_and_return_first_row(
            self.failed_validation_count_query)

        if (stale_checksumming_areas + stale_validating_areas + scheduled_checksum_areas +
                scheduled_validation_areas + failed_checksum_count + failed_validation_count) == 0:
            upload_area_status = 'GOOD\n'
        else:
            upload_area_status = f"{undeleted_upload_area_count} undeleted areas, {stale_checksumming_areas}" \
                f" stuck in checksumming, {stale_validating_areas} stuck in validation \n" \
                f"{scheduled_checksum_areas} files scheduled for checksumming, " \
                f"{scheduled_validation_areas} files scheduled for validation (for over 2 hours)\n" \
                f"{failed_checksum_count} files failed batch checksumming in last day\n" \
                f"{failed_validation_count} files failed batch validation in last day\n"
        return upload_area_status

    def post_message_to_url(self, url, message):
        """POST `message` as JSON to `url` (the Slack incoming webhook)."""
        body = json.dumps(message)
        headers = {'Content-Type': 'application/json'}
        requests.post(url=url, data=body, headers=headers)

    def _query_cloudwatch_metrics_for_past_day(self, metric_data_queries):
        """Run GetMetricData over the past 24 hours and return {Id: value}.

        Values are truncated to int; an Id with no datapoints maps to the
        string "no value returned" (callers compare against 0, so this reads
        as a non-zero/unknown result).

        NOTE(review): `client` is a module-level CloudWatch client defined
        outside this view — presumably boto3; confirm.
        """
        now = datetime.utcnow()
        yesterday = now - timedelta(hours=24)
        response = client.get_metric_data(
            MetricDataQueries=metric_data_queries,
            StartTime=yesterday,
            EndTime=now)
        results = {}
        for info in response['MetricDataResults']:
            if len(info['Values']) > 0:
                results[info['Id']] = int(info['Values'][0])
            else:
                results[info['Id']] = "no value returned"
        return results

    def _query_db_and_return_first_row(self, query):
        """Run `query` and return the first column of the first row.

        Returns 0 when the query yields no rows. (Previously `results` was
        only assigned inside the non-empty branch, so an empty result set
        raised UnboundLocalError; 0 is safe for all callers, which sum or
        compare counts.)
        """
        query_result = self.db.run_query(query)
        rows = query_result.fetchall()
        if len(rows) > 0:
            return rows[0][0]
        return 0
class BatchWatcher:
    """Watch AWS Batch checksum/validation jobs for this deployment; when a
    failed job is detected, terminate the deployment's Batch EC2 instances
    and reschedule every incomplete job (validation via the Ingest API,
    checksumming via the checksum Lambda).
    """

    def __init__(self):
        self.api_key = os.environ["INGEST_API_KEY"]
        self.deployment_stage = os.environ["DEPLOYMENT_STAGE"]
        self.api_host = os.environ["API_HOST"]
        self.batch_client = boto3.client("batch")
        self.ec2_client = boto3.client('ec2')
        self.lambda_client = boto3.client('lambda')
        self.db = UploadDB()

    def run(self):
        """Entry point: detect failed batch jobs and, if any, kill the batch
        instances and reschedule all incomplete jobs.
        """
        incomplete_checksum_jobs, incomplete_validation_jobs = self.find_incomplete_batch_jobs()
        logger.info(f"Found {len(incomplete_checksum_jobs)} incomplete checksum jobs utilizing batch")
        logger.info(f"Found {len(incomplete_validation_jobs)} incomplete validation jobs utilizing batch")
        incomplete_jobs = incomplete_checksum_jobs + incomplete_validation_jobs

        if self.should_instances_be_killed(incomplete_jobs):
            self.find_and_kill_deployment_batch_instances()
            # Re-fetch incomplete checksum and validation jobs after killing
            # instances to catch newly scheduled ones.
            incomplete_checksum_jobs, incomplete_validation_jobs = self.find_incomplete_batch_jobs()
            for row in incomplete_validation_jobs:
                self.schedule_job(row, "validation")
            for row in incomplete_checksum_jobs:
                self.schedule_job(row, "checksum")
            logger.info(f"Finished rescheduling {len(incomplete_validation_jobs)} validation jobs and "
                        f"{len(incomplete_checksum_jobs)} checksum jobs")
        else:
            logger.info("No new failed jobs detected in batch. Jobs will continue untouched.")

    def should_instances_be_killed(self, rows):
        """Return True if any of `rows` (DB job records with id/job_id/file_id
        keys) corresponds to a FAILED AWS Batch job.
        """
        kill_instances = False
        for row in rows:
            db_id = row["id"]
            job_id = row["job_id"]
            file_id = row["file_id"]
            status = self._get_job_status(job_id)
            if status == "FAILED":
                logger.info(f"database record id {db_id} for file {file_id} represents a failed batch job. "
                            f"Time to kill instances.")
                kill_instances = True
                break
        return kill_instances

    @retry_on_aws_too_many_requests
    def _get_job_status(self, job_id):
        """Return the AWS Batch status string for `job_id`, or None if the
        job is not found (describe_jobs returns no entries).
        """
        response = self.batch_client.describe_jobs(jobs=[job_id])
        jobs = response.get("jobs")
        if jobs:
            return jobs[0]["status"]

    def find_incomplete_batch_jobs(self):
        """Return (checksum_rows, validation_rows) for jobs still in flight.

        Validation: SCHEDULED or VALIDATING. Checksum: SCHEDULED or
        CHECKSUMMING, restricted to rows that have a batch job_id.
        """
        validation_results = self.db.run_query(
            "SELECT * from validation "
            "WHERE status = 'SCHEDULED' or status = 'VALIDATING';")
        validation_rows = validation_results.fetchall()
        checksum_results = self.db.run_query(
            "SELECT * from checksum "
            "WHERE(status='SCHEDULED' or status = 'CHECKSUMMING') "
            "and job_id is not null;")
        checksum_rows = checksum_results.fetchall()
        return checksum_rows, validation_rows

    def find_and_kill_deployment_batch_instances(self):
        """Terminate every running EC2 instance keyed to this deployment's
        batch key pair; return the list of terminated instance ids.
        """
        instance_ids = []
        key_name = f"hca-upload-{self.deployment_stage}"
        reservations = self.ec2_client.describe_instances(
            Filters=[{'Name': 'key-name', 'Values': [key_name]},
                     {'Name': 'instance-state-name', 'Values': ["running"]}])
        instance_groups = [x["Instances"] for x in reservations["Reservations"]]
        for group in instance_groups:
            for instance in group:
                instance_ids.append(instance['InstanceId'])
        if len(instance_ids):
            logger.info(f"Killing instances associated with key {key_name} and ec2 ids {str(instance_ids)}")
            self.ec2_client.terminate_instances(InstanceIds=instance_ids)
        return instance_ids

    def schedule_job(self, row, table_name):
        """Reschedule one incomplete job (`row` from `table_name`, either
        "checksum" or "validation") and mark the old DB record FAILED.
        """
        db_id = row["id"]
        file_id = row["file_id"]
        # file_id is "<upload_area_id>/<file_name>"
        file_id_split = file_id.split("/")
        upload_area_id = file_id_split[0]
        file_name = file_id_split[1]
        if table_name == "checksum":
            self.invoke_checksum_lambda(file_id)
        elif table_name == "validation":
            docker_image = row["docker_image"]
            # Multiple validation attempts on a file should point to the same
            # original validation id.
            original_validation_id = row["original_validation_id"]
            if not original_validation_id:
                # If there is no original_validation_id, set the db id of the
                # first validation attempt as original_validation_id.
                original_validation_id = db_id
            self.schedule_validation_job(upload_area_id, file_name, docker_image, original_validation_id)
        logger.info(f"Marking {table_name} record id {db_id} for file {file_id} as failed.")
        # Bug fix: params must be a sequence for %s substitution — (db_id)
        # is a bare scalar, (db_id,) is the required one-element tuple.
        self.db.run_query_with_params(
            f"UPDATE {table_name} SET status = 'FAILED' WHERE id = %s;", (db_id,))

    def schedule_validation_job(self, upload_area_id, file_name, docker_image, original_validation_id):
        """Ask the Ingest API to re-run validation for one file.

        Raises UploadException when the API does not answer 200 OK.
        """
        headers = {'Api-Key': self.api_key}
        message = {
            "validator_image": docker_image,
            "original_validation_id": original_validation_id
        }
        response = requests.put(self.api_host, headers=headers, json=message)
        if response.status_code == requests.codes.ok:
            logger.info(f"scheduled {upload_area_id}/{file_name} for validation")
        else:
            raise UploadException(f"Failed to schedule {upload_area_id}/{file_name} for validation")

    def invoke_checksum_lambda(self, file_id):
        """Re-trigger checksumming by invoking the checksum Lambda with a
        synthetic S3 ObjectCreated:Put event for `file_id` (async).
        """
        payload = {
            'Records': [{
                'eventName': 'ObjectCreated:Put',
                "s3": {
                    "bucket": {"name": f"org-humancellatlas-upload-{self.deployment_stage}"},
                    "object": {"key": file_id}
                }
            }]
        }
        self.lambda_client.invoke(
            FunctionName=f"dcp-upload-csum-{self.deployment_stage}",
            InvocationType='Event',
            Payload=json.dumps(payload).encode())
        logger.info(f"scheduled {file_id} for checksumming")