def __init__(self, request_id: str):
    """Initialize a handler for a single matrix request.

    Wires up the AWS-facing helpers (DynamoDB, SQS, S3), loads the infra and
    Redshift configuration, and tags all subsequent log lines with the
    request ID.

    :param request_id: unique ID of the matrix request being serviced
    """
    # Set the correlation ID first so every log line below is already tagged.
    Logging.set_correlation_id(logger, value=request_id)
    self.request_id = request_id
    # Tracks per-request state/progress in DynamoDB.
    self.request_tracker = RequestTracker(request_id)
    self.dynamo_handler = DynamoHandler()
    self.sqs_handler = SQSHandler()
    self.infra_config = MatrixInfraConfig()
    self.redshift_config = MatrixRedshiftConfig()
    # Bucket names come from the deployment environment; a missing variable
    # raises KeyError here rather than later at first use.
    self.query_results_bucket = os.environ['MATRIX_QUERY_RESULTS_BUCKET']
    self.s3_handler = S3Handler(os.environ['MATRIX_QUERY_BUCKET'])
def __init__(self, args):
    """Initialize a matrix converter/writer from parsed CLI-style arguments.

    :param args: namespace with at least `format`, `request_id`,
                 `target_path`, and `working_dir` attributes
                 (presumably argparse output — TODO confirm against caller)
    """
    self.args = args
    self.format = args.format
    # Tracks per-request state/progress in DynamoDB.
    self.request_tracker = RequestTracker(args.request_id)
    # Accumulates query results as they are loaded, keyed per query.
    self.query_results = {}
    # Final path component of the target, used as the local output filename.
    self.local_output_filename = os.path.basename(os.path.normpath(args.target_path))
    self.target_path = args.target_path
    self.working_dir = args.working_dir
    # S3 filesystem interface used for reading/writing matrix data.
    self.FS = s3fs.S3FileSystem()
    # Tag all subsequent log lines with the request ID.
    Logging.set_correlation_id(LOGGER, value=args.request_id)
def invalidate_cache_entries(request_ids: list = None, request_hashes: list = None):
    """
    Invalidates a list of request IDs and/or request hashes.

    Invalidation refers to the invalidation of the request in DynamoDB and the
    deletion of the associated matrix in S3. Invalidated requests will return
    an `ERROR` state and explanation to the user via the GET endpoint.
    Request hashes are resolved to a list of associated request IDs.

    :param request_ids: list of request IDs to invalidate (optional)
    :param request_hashes: list of request hashes to invalidate (optional)
    """
    # Normalize the optional arguments. The previous code iterated
    # `request_hashes` and appended to `request_ids` unconditionally, which
    # raised TypeError/AttributeError whenever either argument was omitted
    # (None). Copying `request_ids` also avoids mutating the caller's list.
    request_ids = list(request_ids) if request_ids else []
    request_hashes = list(request_hashes) if request_hashes else []

    print(f"Invalidating request IDs: {request_ids}")
    print(f"Invalidating request hashes: {request_hashes}")
    deployment_stage = os.environ['DEPLOYMENT_STAGE']
    dynamo_handler = DynamoHandler()
    # Hash lookups are scoped to the deployment's current data version so
    # only live requests are resolved.
    data_version = dynamo_handler.get_table_item(
        table=DynamoTable.DEPLOYMENT_TABLE,
        key=deployment_stage)[DeploymentTableField.CURRENT_DATA_VERSION.value]

    # Resolve each request hash to its associated request IDs.
    for request_hash in request_hashes:
        items = dynamo_handler.filter_table_items(
            table=DynamoTable.REQUEST_TABLE,
            attrs={
                RequestTableField.REQUEST_HASH.value: request_hash,
                RequestTableField.DATA_VERSION.value: data_version
            })
        for item in items:
            request_ids.append(item[RequestTableField.REQUEST_ID.value])

    # Mark every request as errored in DynamoDB and collect its S3 result key.
    s3_keys_to_delete = []
    for request_id in request_ids:
        print(f"Writing deletion error to {request_id} in DynamoDB.")
        request_tracker = RequestTracker(request_id=request_id)
        request_tracker.log_error(
            "This request has been deleted and is no longer available for download. "
            "Please generate a new matrix at POST /v1/matrix.")
        s3_keys_to_delete.append(request_tracker.s3_results_key)

    # Delete the matrices themselves from the results bucket.
    print(f"Deleting matrices at the following S3 keys: {s3_keys_to_delete}")
    s3_results_bucket_handler = S3Handler(os.environ['MATRIX_RESULTS_BUCKET'])
    deleted_objects = s3_results_bucket_handler.delete_objects(s3_keys_to_delete)
    deleted_keys = [deleted_object['Key'] for deleted_object in deleted_objects]
    print(f"Successfully deleted the following matrices {deleted_keys}. "
          f"({len(deleted_keys)}/{len(s3_keys_to_delete)})")
def run(self, max_loops=None):
    """Poll the query job queue and execute Redshift queries until stopped.

    Each loop receives at most one SQS message, runs the referenced query
    against Redshift, and on success marks the QUERY subtask complete —
    scheduling the batch matrix-conversion job if this was the last
    outstanding query. On any failure the message is routed to the
    dead-letter queue and the error is recorded on the request.

    :param max_loops: maximum number of polling iterations; None means
                      loop forever (the normal daemon mode)
    """
    loops = 0
    while max_loops is None or loops < max_loops:
        loops += 1
        messages = self.sqs_handler.receive_messages_from_queue(
            self.query_job_q_url)
        if messages:
            # Only the first received message is processed per iteration.
            message = messages[0]
            logger.info(f"Received {message} from {self.query_job_q_url}")
            payload = json.loads(message['Body'])
            request_id = payload['request_id']
            request_tracker = RequestTracker(request_id)
            # Tag subsequent log lines with this request's ID.
            Logging.set_correlation_id(logger, value=request_id)
            obj_key = payload['s3_obj_key']
            receipt_handle = message['ReceiptHandle']
            try:
                # The SQL text itself lives in S3; the message only carries
                # the object key.
                logger.info(f"Fetching query from {obj_key}")
                query = self.s3_handler.load_content_from_obj_key(obj_key)
                logger.info(f"Running query from {obj_key}")
                self.redshift_handler.transaction([query], read_only=True)
                logger.info(f"Finished running query from {obj_key}")
                # Delete the message only after the query succeeded, so a
                # crash before this point lets SQS redeliver it.
                logger.info(
                    f"Deleting {message} from {self.query_job_q_url}")
                self.sqs_handler.delete_message_from_queue(
                    self.query_job_q_url, receipt_handle)
                logger.info(
                    "Incrementing completed queries in state table")
                request_tracker.complete_subtask_execution(Subtask.QUERY)
                # If all queries for the request are done, kick off the
                # matrix conversion batch job and persist its job ID.
                if request_tracker.is_request_ready_for_conversion():
                    logger.info("Scheduling batch conversion job")
                    batch_job_id = self.batch_handler.schedule_matrix_conversion(
                        request_id, request_tracker.format)
                    request_tracker.write_batch_job_id_to_db(batch_job_id)
            except Exception as e:
                # Best-effort failure path: record the error, move the
                # payload to the dead-letter queue, then remove the message
                # from the main queue so it is not retried here.
                logger.info(
                    f"QueryRunner failed on {message} with error {e}")
                request_tracker.log_error(str(e))
                logger.info(
                    f"Adding {message} to {self.query_job_deadletter_q_url}"
                )
                self.sqs_handler.add_message_to_queue(
                    self.query_job_deadletter_q_url, payload)
                logger.info(
                    f"Deleting {message} from {self.query_job_q_url}")
                self.sqs_handler.delete_message_from_queue(
                    self.query_job_q_url, receipt_handle)
        else:
            logger.info(f"No messages to read from {self.query_job_q_url}")
def setUp(self, mock_get_datetime_now):
    """Create a fresh request-table fixture with a frozen timestamp.

    :param mock_get_datetime_now: patched clock injected by the test
        framework (presumably via a class-level @mock.patch — confirm
        against the decorator outside this view)
    """
    super(TestRequestTracker, self).setUp()
    # Freeze "now" so timestamps written to DynamoDB are deterministic.
    self.stub_date = '2019-03-18T180907.136216Z'
    mock_get_datetime_now.return_value = self.stub_date
    self.request_id = str(uuid.uuid4())
    self.request_tracker = RequestTracker(self.request_id)
    self.dynamo_handler = DynamoHandler()
    # The table must exist before the entry can be created.
    self.create_test_request_table()
    self.dynamo_handler.create_request_table_entry(self.request_id, "test_format")
def post_matrix(body: dict):
    """Handle POST /matrix: validate the request body and start a matrix job.

    :param body: parsed JSON request body; must contain a `filter` and may
                 contain `format`, `fields`, and `feature`
    :return: (response body, HTTP status) tuple
    """
    feature = body.get("feature", constants.DEFAULT_FEATURE)
    fields = body.get("fields", constants.DEFAULT_FIELDS)
    output_format = body.get('format', MatrixFormat.LOOM.value)

    # --- Input validation -------------------------------------------------
    valid_formats = {mf.value for mf in MatrixFormat}
    if output_format not in valid_formats:
        message = ("Invalid parameters supplied. "
                   "Please supply a valid `format`. "
                   "Visit https://matrix.dev.data.humancellatlas.org for more information.")
        return ({'message': message}, requests.codes.bad_request)

    if "filter" not in body:
        message = ("Invalid parameters supplied. "
                   "Please supply a filter. "
                   "Visit https://matrix.dev.data.humancellatlas.org for more information.")
        return ({'message': message}, requests.codes.bad_request)

    if len(json.dumps(body["filter"])) > 128000:
        message = ("The filter specification is too large. "
                   "Visit https://matrix.dev.data.humancellatlas.org for more information.")
        return ({'message': message}, requests.codes.request_entity_too_large)

    # --- Register the request and invoke the driver -----------------------
    request_id = str(uuid.uuid4())
    RequestTracker(request_id).initialize_request(output_format, fields, feature)

    lambda_handler.invoke(LambdaName.DRIVER_V1, {
        'request_id': request_id,
        'filter': body["filter"],
        'fields': fields,
        'feature': feature
    })

    return ({
        'request_id': request_id,
        'status': MatrixRequestStatus.IN_PROGRESS.value,
        'matrix_url': "",
        'eta': "",
        'message': "Job started."
    }, requests.codes.accepted)
def setUp(self, mock_get_datetime_now):
    """Create the full set of v1 table/bucket fixtures with a frozen clock.

    :param mock_get_datetime_now: patched clock injected by the test
        framework (presumably via a class-level @mock.patch — confirm
        against the decorator outside this view)
    """
    super(TestRequestTracker, self).setUp()
    # Freeze "now" so timestamps written to DynamoDB are deterministic.
    self.stub_date = '2019-03-18T180907.136216Z'
    mock_get_datetime_now.return_value = self.stub_date
    self.request_id = str(uuid.uuid4())
    self.request_tracker = RequestTracker(self.request_id)
    self.dynamo_handler = DynamoHandler()
    # Create all tables/buckets before seeding them with data.
    self.create_test_data_version_table()
    self.create_test_deployment_table()
    self.create_test_request_table()
    self.create_s3_results_bucket()
    self.init_test_data_version_table()
    self.init_test_deployment_table()
    self.dynamo_handler.create_request_table_entry(
        self.request_id,
        "test_format",
        ["test_field_1", "test_field_2"],
        "test_feature")
def get_matrix(request_id: str):
    """Handle GET /matrix/{request_id}: report the status of a matrix request.

    Walks the request's lifecycle in precedence order: unknown ID (404),
    in-progress, failed, failed batch conversion, complete, expired,
    timed out, otherwise still in-progress.

    :param request_id: ID of the request to look up
    :return: (response body, HTTP status) tuple
    """
    # There are a few cases to handle here. First, if the request_id is not in
    # the state table at all, then this id has never been made and we should
    # 404.
    request_tracker = RequestTracker(request_id)
    if not request_tracker.is_initialized:
        return ({'message': f"Unable to find job with request ID {request_id}."},
                requests.codes.not_found)

    in_progress_response = ({
        'request_id': request_id,
        'status': MatrixRequestStatus.IN_PROGRESS.value,
        'matrix_url': "",
        'eta': "",
        'message': f"Request {request_id} has been accepted and is currently being "
                   f"processed. Please try again later."},
        requests.codes.ok)

    # If the request tracker is not able to retrieve the format,
    # it means that the driver has not created the relevant entry in the
    # output table yet.
    # NOTE: renamed from `format`, which shadowed the builtin.
    try:
        format_ = request_tracker.format
    except MatrixException:
        return in_progress_response

    # Failed case
    if request_tracker.error:
        return ({
            'request_id': request_id,
            'status': MatrixRequestStatus.FAILED.value,
            'matrix_url': "",
            'eta': "",
            'message': request_tracker.error},
            requests.codes.ok)
    # Check for failed batch conversion job.
    # (The previous `status and status == "FAILED"` truthiness guard was
    # redundant; the error message string also contained a malformed
    # backslash line-continuation, now cleaned up.)
    elif request_tracker.batch_job_status == "FAILED":
        request_tracker.log_error(
            "The matrix conversion as a part of the request has failed. "
            "Please retry or contact an hca admin for help.")
        return ({
            'request_id': request_id,
            'status': MatrixRequestStatus.FAILED.value,
            'matrix_url': "",
            'eta': "",
            'message': request_tracker.error},
            requests.codes.ok)
    # Complete case
    elif request_tracker.is_request_complete():
        matrix_results_bucket = os.environ['MATRIX_RESULTS_BUCKET']
        matrix_results_handler = S3Handler(matrix_results_bucket)

        # Loom matrices are a single file; csv/mtx results are zipped.
        matrix_key = ""
        if format_ == MatrixFormat.LOOM.value:
            matrix_key = f"{request_tracker.s3_results_prefix}/{request_id}.{format_}"
        elif format_ == MatrixFormat.CSV.value or format_ == MatrixFormat.MTX.value:
            matrix_key = f"{request_tracker.s3_results_prefix}/{request_id}.{format_}.zip"
        matrix_location = f"https://s3.amazonaws.com/{matrix_results_bucket}/{matrix_key}"

        # A zero-size (or missing) object means the query matched no cells.
        is_empty = False
        if not matrix_results_handler.size(matrix_key):
            is_empty = True
            matrix_location = ""

        if not is_empty:
            message = (f"Request {request_id} has successfully completed. "
                       f"The resultant expression matrix is available for download at "
                       f"{matrix_location}.")
        else:
            message = (f"Request {request_id} has successfully completed. "
                       f"But, there were no cells associated with this request and "
                       f"species {request_tracker.genus_species.value}")
        return ({
            'request_id': request_id,
            'status': MatrixRequestStatus.COMPLETE.value,
            'matrix_url': matrix_location,
            'eta': "",
            'message': message},
            requests.codes.ok)
    # Expired case
    elif request_tracker.is_expired:
        return ({
            'request_id': request_id,
            'status': MatrixRequestStatus.EXPIRED.value,
            'matrix_url': "",
            'eta': "",
            'message': request_tracker.error},
            requests.codes.ok)
    # Timeout case
    elif request_tracker.timeout:
        return ({
            'request_id': request_id,
            'status': MatrixRequestStatus.FAILED.value,
            'matrix_url': "",
            'eta': "",
            'message': request_tracker.error},
            requests.codes.ok)
    else:
        return in_progress_response
def post_matrix(body: dict):
    """Handle POST /v1/matrix: validate the body and start one matrix job
    per target species.

    :param body: parsed JSON request body; must contain a `filter` and may
                 contain `format`, `fields`, and `feature`
    :return: (response body, HTTP status) tuple
    """
    feature = body.get("feature", constants.DEFAULT_FEATURE)
    # Copy the fields list: the fallback is the shared module-level
    # constants.DEFAULT_FIELDS, and the .mtx branch below appends to the
    # list. Mutating it in place corrupted the default for every subsequent
    # request (and mutated the caller's `body` when fields were supplied).
    fields = list(body.get("fields", constants.DEFAULT_FIELDS))
    format_ = body['format'] if 'format' in body else MatrixFormat.LOOM.value
    expected_formats = [mf.value for mf in MatrixFormat]

    # Validate input parameters
    if format_ not in expected_formats:
        return ({
            'message': "Invalid parameters supplied. "
                       "Please supply a valid `format`. "
                       "Visit https://matrix.dev.data.humancellatlas.org for more information."},
            requests.codes.bad_request)
    if "filter" not in body:
        return ({
            'message': "Invalid parameters supplied. "
                       "Please supply a filter. "
                       "Visit https://matrix.dev.data.humancellatlas.org for more information."},
            requests.codes.bad_request)
    if len(json.dumps(body["filter"])) > 128000:
        return ({
            'message': "The filter specification is too large. "
                       "Visit https://matrix.dev.data.humancellatlas.org for more information."},
            requests.codes.request_entity_too_large)

    if query_constructor.has_genus_species_term(body["filter"]):
        # If the user has mentioned something about species, then maybe
        # they're looking for non-human data. So we'll run queries for all
        # the species that we know about.
        genera_species = list(constants.GenusSpecies)
    else:
        # Otherwise, default to human-only
        genera_species = [constants.GenusSpecies.HUMAN]

    # Cell barcode is required in .mtx matrices by 10x specifications
    if format_ == MatrixFormat.MTX.value and "cell.barcode" not in fields and "barcode" not in fields:
        fields.append("cell.barcode")

    # Start one driver invocation per species; remember the human request ID
    # separately since it is the primary ID returned to the client.
    human_request_id = ""
    non_human_request_ids = {}
    for genus_species in genera_species:
        request_id = str(uuid.uuid4())
        RequestTracker(request_id).initialize_request(format_, fields, feature, genus_species)
        driver_payload = {
            'request_id': request_id,
            'filter': body["filter"],
            'fields': fields,
            'feature': feature,
            'genus_species': genus_species.value
        }
        lambda_handler.invoke(LambdaName.DRIVER_V1, driver_payload)
        if genus_species == GenusSpecies.HUMAN:
            human_request_id = request_id
        else:
            non_human_request_ids[genus_species.value] = request_id

    return ({
        'request_id': human_request_id,
        'non_human_request_ids': non_human_request_ids,
        'status': MatrixRequestStatus.IN_PROGRESS.value,
        'message': "Job started."},
        requests.codes.accepted)
def test_is_initialized(self):
    """The tracker created in setUp is initialized; an unknown ID is not."""
    self.assertTrue(self.request_tracker.is_initialized)

    uninitialized_tracker = RequestTracker("test_uuid")
    self.assertFalse(uninitialized_tracker.is_initialized)
def post_matrix(body: dict):
    """Handle POST /v0/matrix: validate bundle inputs and start a driver job.

    Exactly one of `bundle_fqids` (inline list) or `bundle_fqids_url`
    (URL to a list) must be supplied in the body.

    :param body: parsed JSON request body
    :return: (response body, HTTP status) tuple
    """
    has_ids = 'bundle_fqids' in body
    has_url = 'bundle_fqids_url' in body
    matrix_format = body.get('format', MatrixFormat.LOOM.value)
    expected_formats = [mf.value for mf in MatrixFormat]

    # Validate input parameters
    if matrix_format not in expected_formats:
        return ({
            'message': "Invalid parameters supplied. "
                       "Please supply a valid `format`. "
                       "Visit https://matrix.data.humancellatlas.org for more information."},
            requests.codes.bad_request)
    if has_ids and has_url:
        return ({
            'message': "Invalid parameters supplied. "
                       "Please supply either one of `bundle_fqids` or `bundle_fqids_url`. "
                       "Visit https://matrix.data.humancellatlas.org for more information."},
            requests.codes.bad_request)
    if not (has_ids or has_url):
        return ({
            'message': "Invalid parameters supplied. "
                       "One of `bundle_fqids` or `bundle_fqids_url` must be supplied. "
                       "Visit https://matrix.data.humancellatlas.org for more information."},
            requests.codes.bad_request)
    if has_ids and len(json.dumps(body['bundle_fqids'])) > 128000:
        return ({
            'message': "List of bundle fqids is too large. "
                       "Consider using bundle_fqids_url instead. "
                       "Visit https://matrix.data.humancellatlas.org for more information."},
            requests.codes.request_entity_too_large)

    if has_url:
        bundle_fqids = None
        bundle_fqids_url = body['bundle_fqids_url']
    else:
        bundle_fqids_url = None
        bundle_fqids = body['bundle_fqids']
        # An inline list must be non-empty.
        if len(bundle_fqids) == 0:
            return ({
                'message': "Invalid parameters supplied. "
                           "Please supply non empty `bundle_fqids`. "
                           "Visit https://matrix.data.humancellatlas.org for more information."},
                requests.codes.bad_request)

    # Register the request and hand off to the v0 driver lambda.
    request_id = str(uuid.uuid4())
    RequestTracker(request_id).initialize_request(matrix_format)

    lambda_handler.invoke(LambdaName.DRIVER_V0, {
        'request_id': request_id,
        'bundle_fqids': bundle_fqids,
        'bundle_fqids_url': bundle_fqids_url,
        'format': matrix_format,
    })

    return ({
        'request_id': request_id,
        'status': MatrixRequestStatus.IN_PROGRESS.value,
        'matrix_url': "",
        'eta': "",
        'message': "Job started."},
        requests.codes.accepted)