def is_request_complete(self) -> bool:
    """
    Checks whether the request has completed.
    :return: bool True if complete, else False
    """
    results_bucket = S3Handler(os.environ['MATRIX_RESULTS_BUCKET'])
    return results_bucket.exists(self.s3_results_key)
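# A minimal usage sketch for is_request_complete(), assuming a RequestTracker
# instance whose results bucket is populated as in the test below; the polling
# helper, its name, and the intervals are illustrative, not part of this repo.
import time

def wait_for_request(tracker, timeout_seconds=300, poll_interval=5) -> bool:
    """Poll until the tracker reports completion or the timeout elapses."""
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        if tracker.is_request_complete():
            return True
        time.sleep(poll_interval)
    return False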
def test_is_request_complete(self):
    self.assertFalse(self.request_tracker.is_request_complete())

    s3_handler = S3Handler(os.environ['MATRIX_RESULTS_BUCKET'])
    s3_handler.store_content_in_s3(
        f"{self.request_tracker.s3_results_key}/{self.request_id}.{self.request_tracker.format}", "")
    self.assertTrue(self.request_tracker.is_request_complete())
def __init__(self, request_id: str):
    Logging.set_correlation_id(logger, value=request_id)

    self.request_id = request_id
    self.request_tracker = RequestTracker(request_id)
    self.dynamo_handler = DynamoHandler()
    self.sqs_handler = SQSHandler()
    self.infra_config = MatrixInfraConfig()
    self.redshift_config = MatrixRedshiftConfig()
    self.query_results_bucket = os.environ['MATRIX_QUERY_RESULTS_BUCKET']
    self.s3_handler = S3Handler(os.environ['MATRIX_QUERY_BUCKET'])
def lookup_cached_result(self) -> str:
    """
    Retrieves the S3 key of an existing matrix result that corresponds to this
    request's request hash. Returns "" if no such result exists.
    :return: S3 key of the cached result
    """
    results_bucket = S3Handler(os.environ['MATRIX_RESULTS_BUCKET'])
    objects = results_bucket.ls(f"{self.s3_results_prefix}/")
    if len(objects) > 0:
        return objects[0]['Key']
    return ""
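# For reference, a hedged sketch of what S3Handler.ls presumably does under the
# hood with boto3 (the real S3Handler is defined elsewhere in this repo): listing
# under "prefix/" is what keeps a bare "prefix" object from matching, which the
# test below relies on. ls_prefix is a hypothetical stand-in, not the repo's API.
import boto3

def ls_prefix(bucket: str, prefix: str) -> list:
    """Return the object summary dicts stored under prefix/ in bucket."""
    client = boto3.client("s3")
    response = client.list_objects_v2(Bucket=bucket, Prefix=f"{prefix}/")
    return response.get("Contents", [])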
def invalidate_cache_entries(request_ids: list = None, request_hashes: list = None):
    """
    Invalidates a list of request IDs and/or request hashes.

    Invalidation refers to the invalidation of the request in DynamoDB and the
    deletion of the associated matrix in S3. Invalidated requests will return
    an `ERROR` state and explanation to the user via the GET endpoint.
    Request hashes are resolved to a list of associated request IDs.

    :param request_ids: list of request IDs to invalidate
    :param request_hashes: list of request hashes to invalidate
    """
    # Default to empty lists so that supplying only one of the two parameters
    # does not raise a TypeError below.
    request_ids = request_ids or []
    request_hashes = request_hashes or []

    print(f"Invalidating request IDs: {request_ids}")
    print(f"Invalidating request hashes: {request_hashes}")
    deployment_stage = os.environ['DEPLOYMENT_STAGE']
    dynamo_handler = DynamoHandler()
    data_version = dynamo_handler.get_table_item(
        table=DynamoTable.DEPLOYMENT_TABLE,
        key=deployment_stage)[DeploymentTableField.CURRENT_DATA_VERSION.value]

    # Resolve request hashes to their associated request IDs.
    for request_hash in request_hashes:
        items = dynamo_handler.filter_table_items(table=DynamoTable.REQUEST_TABLE,
                                                  attrs={
                                                      RequestTableField.REQUEST_HASH.value: request_hash,
                                                      RequestTableField.DATA_VERSION.value: data_version
                                                  })
        for item in items:
            request_ids.append(item[RequestTableField.REQUEST_ID.value])

    s3_keys_to_delete = []
    for request_id in request_ids:
        print(f"Writing deletion error to {request_id} in DynamoDB.")
        request_tracker = RequestTracker(request_id=request_id)
        request_tracker.log_error("This request has been deleted and is no longer available for download. "
                                  "Please generate a new matrix at POST /v1/matrix.")
        s3_keys_to_delete.append(request_tracker.s3_results_key)

    print(f"Deleting matrices at the following S3 keys: {s3_keys_to_delete}")
    s3_results_bucket_handler = S3Handler(os.environ['MATRIX_RESULTS_BUCKET'])
    deleted_objects = s3_results_bucket_handler.delete_objects(s3_keys_to_delete)

    deleted_keys = [deleted_object['Key'] for deleted_object in deleted_objects]
    print(f"Successfully deleted the following matrices: {deleted_keys}. "
          f"({len(deleted_keys)}/{len(s3_keys_to_delete)})")
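# Example invocation, e.g. from an operator script; the ID and hash values are
# hypothetical, and DEPLOYMENT_STAGE plus the DynamoDB tables must already exist.
invalidate_cache_entries(request_ids=["example-request-id"],
                         request_hashes=["example-request-hash"])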
def test_lookup_cached_result(self, mock_s3_results_prefix):
    mock_s3_results_prefix.return_value = "test_prefix"
    s3_handler = S3Handler(os.environ['MATRIX_RESULTS_BUCKET'])

    with self.subTest("Do not match in S3 'directories'"):
        s3_handler.store_content_in_s3("test_prefix", "test_content")
        self.assertEqual(self.request_tracker.lookup_cached_result(), "")

    with self.subTest("Successfully retrieve a result key"):
        s3_handler.store_content_in_s3("test_prefix/test_result_1", "test_content")
        s3_handler.store_content_in_s3("test_prefix/test_result_2", "test_content")
        self.assertEqual(self.request_tracker.lookup_cached_result(), "test_prefix/test_result_1")
@property
def is_expired(self):
    """
    Whether the request has expired, i.e. it is past its 30-day expiration
    window and the matrix in S3 has been deleted.
    :return: bool
    """
    s3_results_bucket_handler = S3Handler(os.environ['MATRIX_RESULTS_BUCKET'])
    is_past_expiration = date.to_datetime(self.creation_date) < date.get_datetime_now() - timedelta(days=30)
    is_expired = not s3_results_bucket_handler.exists(self.s3_results_key) and is_past_expiration

    if is_expired:
        self.log_error("This request has expired after 30 days and is no longer available for download. "
                       "A new matrix can be generated by resubmitting the POST request to /v1/matrix.")
    return is_expired
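# A quick illustration of the 30-day cutoff computed above, using plain stdlib
# datetimes in place of the repo's date helpers; the dates are made up.
from datetime import datetime, timedelta

creation_date = datetime(2019, 1, 1)
now = datetime(2019, 2, 15)
print(creation_date < now - timedelta(days=30))  # True: past the expiration window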
def get_matrix(request_id: str):
    # There are a few cases to handle here. First, if the request_id is not in
    # the state table at all, then this id has never been made and we should
    # 404.
    request_tracker = RequestTracker(request_id)
    if not request_tracker.is_initialized:
        return ({'message': f"Unable to find job with request ID {request_id}."},
                requests.codes.not_found)

    in_progress_response = ({'request_id': request_id,
                             'status': MatrixRequestStatus.IN_PROGRESS.value,
                             'matrix_url': "",
                             'eta': "",
                             'message': f"Request {request_id} has been accepted and is currently being "
                                        f"processed. Please try again later."},
                            requests.codes.ok)

    # If the request tracker is not able to retrieve the format, the driver has
    # not yet created the relevant entry in the output table.
    try:
        format = request_tracker.format
    except MatrixException:
        return in_progress_response

    # Failed case
    if request_tracker.error:
        return ({'request_id': request_id,
                 'status': MatrixRequestStatus.FAILED.value,
                 'matrix_url': "",
                 'eta': "",
                 'message': request_tracker.error},
                requests.codes.ok)
    # Check for a failed batch conversion job
    elif request_tracker.batch_job_status == "FAILED":
        request_tracker.log_error("The matrix conversion that is part of this request has failed. "
                                  "Please retry or contact an HCA admin for help.")
        return ({'request_id': request_id,
                 'status': MatrixRequestStatus.FAILED.value,
                 'matrix_url': "",
                 'eta': "",
                 'message': request_tracker.error},
                requests.codes.ok)
    # Complete case
    elif request_tracker.is_request_complete():
        matrix_results_bucket = os.environ['MATRIX_RESULTS_BUCKET']
        matrix_results_handler = S3Handler(matrix_results_bucket)

        matrix_key = ""
        if format == MatrixFormat.LOOM.value:
            matrix_key = f"{request_tracker.s3_results_prefix}/{request_id}.{format}"
        elif format == MatrixFormat.CSV.value or format == MatrixFormat.MTX.value:
            matrix_key = f"{request_tracker.s3_results_prefix}/{request_id}.{format}.zip"

        matrix_location = f"https://s3.amazonaws.com/{matrix_results_bucket}/{matrix_key}"

        is_empty = False
        if not matrix_results_handler.size(matrix_key):
            is_empty = True
            matrix_location = ""

        if not is_empty:
            message = (f"Request {request_id} has successfully completed. "
                       f"The resultant expression matrix is available for download at "
                       f"{matrix_location}.")
        else:
            message = (f"Request {request_id} has successfully completed, but no cells "
                       f"were associated with this request and species "
                       f"{request_tracker.genus_species.value}.")
        return ({'request_id': request_id,
                 'status': MatrixRequestStatus.COMPLETE.value,
                 'matrix_url': matrix_location,
                 'eta': "",
                 'message': message},
                requests.codes.ok)
    # Expired case
    elif request_tracker.is_expired:
        return ({'request_id': request_id,
                 'status': MatrixRequestStatus.EXPIRED.value,
                 'matrix_url': "",
                 'eta': "",
                 'message': request_tracker.error},
                requests.codes.ok)
    # Timeout case
    elif request_tracker.timeout:
        return ({'request_id': request_id,
                 'status': MatrixRequestStatus.FAILED.value,
                 'matrix_url': "",
                 'eta': "",
                 'message': request_tracker.error},
                requests.codes.ok)
    else:
        return in_progress_response
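# Client-side sketch of polling this handler over HTTP; the GET path is an
# assumption inferred from the POST /v1/matrix path cited in the error messages
# above, and base_url is a placeholder for the deployed API host.
import requests

def poll_matrix_status(base_url: str, request_id: str) -> dict:
    """Fetch the current status payload for a matrix request."""
    response = requests.get(f"{base_url}/v1/matrix/{request_id}")
    response.raise_for_status()
    return response.json()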
def setUp(self):
    super(TestS3Handler, self).setUp()
    self.create_s3_queries_bucket()
    self.s3_handler = S3Handler(os.environ['MATRIX_QUERY_BUCKET'])
    self.request_id = str(uuid.uuid4())
def test_invalidate_cache_entries(self, mock_put_metric_data):
    """
    Setup:
    - Create four request ids mapping to two request hashes
    - Invalidate hash 1 (ids 1, 2) and id 3
    - Verify ids 1, 2 and 3 have been invalidated
    - Verify id 4 has not been invalidated
    """
    request_hash_1 = "test_hash_1"
    request_hash_2 = "test_hash_2"
    request_id_1 = "test_id_1"
    request_id_2 = "test_id_2"
    request_id_3 = "test_id_3"
    request_id_4 = "test_id_4"
    test_format = "test_format"
    test_content = "test_content"
    s3_key_1 = f"0/{request_hash_1}/{request_id_1}.{test_format}"
    s3_key_2 = f"0/{request_hash_1}/{request_id_2}.{test_format}"
    s3_key_3 = f"0/{request_hash_2}/{request_id_3}.{test_format}"
    s3_key_4 = f"0/{request_hash_2}/{request_id_4}.{test_format}"

    dynamo_handler = DynamoHandler()
    dynamo_handler.create_request_table_entry(request_id_1, test_format)
    dynamo_handler.create_request_table_entry(request_id_2, test_format)
    dynamo_handler.create_request_table_entry(request_id_3, test_format)
    dynamo_handler.create_request_table_entry(request_id_4, test_format)
    dynamo_handler.set_table_field_with_value(table=DynamoTable.REQUEST_TABLE,
                                              key=request_id_1,
                                              field_enum=RequestTableField.REQUEST_HASH,
                                              field_value=request_hash_1)
    dynamo_handler.set_table_field_with_value(table=DynamoTable.REQUEST_TABLE,
                                              key=request_id_2,
                                              field_enum=RequestTableField.REQUEST_HASH,
                                              field_value=request_hash_1)
    dynamo_handler.set_table_field_with_value(table=DynamoTable.REQUEST_TABLE,
                                              key=request_id_3,
                                              field_enum=RequestTableField.REQUEST_HASH,
                                              field_value=request_hash_2)
    dynamo_handler.set_table_field_with_value(table=DynamoTable.REQUEST_TABLE,
                                              key=request_id_4,
                                              field_enum=RequestTableField.REQUEST_HASH,
                                              field_value=request_hash_2)

    s3_results_bucket_handler = S3Handler(os.environ['MATRIX_RESULTS_BUCKET'])
    s3_results_bucket_handler.store_content_in_s3(s3_key_1, test_content)
    s3_results_bucket_handler.store_content_in_s3(s3_key_2, test_content)
    s3_results_bucket_handler.store_content_in_s3(s3_key_3, test_content)
    s3_results_bucket_handler.store_content_in_s3(s3_key_4, test_content)
    self.assertTrue(s3_results_bucket_handler.exists(s3_key_1))
    self.assertTrue(s3_results_bucket_handler.exists(s3_key_2))
    self.assertTrue(s3_results_bucket_handler.exists(s3_key_3))
    self.assertTrue(s3_results_bucket_handler.exists(s3_key_4))

    invalidate_cache_entries(request_ids=[request_id_3], request_hashes=[request_hash_1])

    error_1 = dynamo_handler.get_table_item(table=DynamoTable.REQUEST_TABLE,
                                            key=request_id_1)[RequestTableField.ERROR_MESSAGE.value]
    error_2 = dynamo_handler.get_table_item(table=DynamoTable.REQUEST_TABLE,
                                            key=request_id_2)[RequestTableField.ERROR_MESSAGE.value]
    error_3 = dynamo_handler.get_table_item(table=DynamoTable.REQUEST_TABLE,
                                            key=request_id_3)[RequestTableField.ERROR_MESSAGE.value]
    error_4 = dynamo_handler.get_table_item(table=DynamoTable.REQUEST_TABLE,
                                            key=request_id_4)[RequestTableField.ERROR_MESSAGE.value]

    self.assertFalse(s3_results_bucket_handler.exists(s3_key_1))
    self.assertFalse(s3_results_bucket_handler.exists(s3_key_2))
    self.assertFalse(s3_results_bucket_handler.exists(s3_key_3))
    self.assertTrue(s3_results_bucket_handler.exists(s3_key_4))
    self.assertNotEqual(error_1, 0)
    self.assertNotEqual(error_2, 0)
    self.assertNotEqual(error_3, 0)
    self.assertEqual(error_4, 0)
def __init__(self):
    self.sqs_handler = SQSHandler()
    self.s3_handler = S3Handler(os.environ["MATRIX_QUERY_BUCKET"])
    self.batch_handler = BatchHandler()
    self.redshift_handler = RedshiftHandler()
    self.matrix_infra_config = MatrixInfraConfig()
def run(self, max_loops=None):
    loops = 0
    while max_loops is None or loops < max_loops:
        loops += 1
        messages = self.sqs_handler.receive_messages_from_queue(self.query_job_q_url)
        if messages:
            message = messages[0]
            logger.info(f"Received {message} from {self.query_job_q_url}")
            payload = json.loads(message['Body'])
            request_id = payload['request_id']
            request_tracker = RequestTracker(request_id)
            Logging.set_correlation_id(logger, value=request_id)
            obj_key = payload['s3_obj_key']
            query_type = payload['type']
            receipt_handle = message['ReceiptHandle']
            try:
                logger.info(f"Fetching query from {obj_key}")
                query = self.s3_handler.load_content_from_obj_key(obj_key)

                logger.info(f"Running query from {obj_key}")
                self.redshift_handler.transaction([query], read_only=True)
                logger.info(f"Finished running query from {obj_key}")

                logger.info(f"Deleting {message} from {self.query_job_q_url}")
                self.sqs_handler.delete_message_from_queue(self.query_job_q_url, receipt_handle)

                if query_type == QueryType.CELL.value:
                    # If a cached result already exists for this request hash,
                    # copy it into place instead of scheduling a conversion.
                    cached_result_s3_key = request_tracker.lookup_cached_result()
                    if cached_result_s3_key:
                        s3 = S3Handler(os.environ['MATRIX_RESULTS_BUCKET'])
                        s3.copy_obj(cached_result_s3_key, request_tracker.s3_results_key)
                        continue

                logger.info("Incrementing completed queries in state table")
                request_tracker.complete_subtask_execution(Subtask.QUERY)

                if request_tracker.is_request_ready_for_conversion():
                    logger.info("Scheduling batch conversion job")
                    batch_job_id = self.batch_handler.schedule_matrix_conversion(request_id,
                                                                                 request_tracker.format,
                                                                                 request_tracker.s3_results_key)
                    request_tracker.write_batch_job_id_to_db(batch_job_id)
            except Exception as e:
                logger.info(f"QueryRunner failed on {message} with error {e}")
                request_tracker.log_error(str(e))
                logger.error(traceback.format_exc())
                logger.info(f"Adding {message} to {self.query_job_deadletter_q_url}")
                self.sqs_handler.add_message_to_queue(self.query_job_deadletter_q_url, payload)
                logger.info(f"Deleting {message} from {self.query_job_q_url}")
                self.sqs_handler.delete_message_from_queue(self.query_job_q_url, receipt_handle)
        else:
            logger.info(f"No messages to read from {self.query_job_q_url}")
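# Shape of the SQS message body that run() expects, reconstructed from the
# payload fields read above; all values are illustrative, and "cell" assumes
# QueryType.CELL.value serializes to that string.
example_query_job_message_body = json.dumps({
    "request_id": "example-request-id",
    "s3_obj_key": "example-request-id/cell.sql",
    "type": "cell",
})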