コード例 #1
0
ファイル: driver.py プロジェクト: ambrosejcarr/matrix-service
    def __init__(self, request_id: str):
        """
        Sets up the collaborators and configuration used to process one request.

        :param request_id: ID of the request; also set as the correlation ID
                           on the module logger
        :raises KeyError: if `MATRIX_QUERY_RESULTS_BUCKET` or
                          `MATRIX_QUERY_BUCKET` is missing from the environment
        """
        # Set the correlation ID first so it is in place before the
        # collaborators below are constructed.
        Logging.set_correlation_id(logger, value=request_id)

        self.request_id = request_id
        self.request_tracker = RequestTracker(request_id)
        self.dynamo_handler = DynamoHandler()
        self.sqs_handler = SQSHandler()
        self.infra_config = MatrixInfraConfig()
        self.redshift_config = MatrixRedshiftConfig()
        # Two different buckets are referenced here: the query *results*
        # bucket is kept by name only, while the query bucket is wrapped in
        # an S3Handler.
        self.query_results_bucket = os.environ['MATRIX_QUERY_RESULTS_BUCKET']
        self.s3_handler = S3Handler(os.environ['MATRIX_QUERY_BUCKET'])
コード例 #2
0
    def __init__(self, args):
        """
        Stores the conversion arguments and prepares the request tracker and
        S3 filesystem used by this object.

        :param args: object exposing `format`, `request_id`, `target_path` and
                     `working_dir` attributes (presumably an argparse
                     namespace -- TODO confirm against the caller)
        """
        self.args = args
        self.format = args.format
        self.request_tracker = RequestTracker(args.request_id)
        # Starts empty; nothing in this constructor populates it.
        self.query_results = {}

        # normpath drops any trailing separator so that basename yields the
        # final path component rather than an empty string.
        self.local_output_filename = os.path.basename(os.path.normpath(args.target_path))
        self.target_path = args.target_path
        self.working_dir = args.working_dir
        self.FS = s3fs.S3FileSystem()

        Logging.set_correlation_id(LOGGER, value=args.request_id)
コード例 #3
0
def invalidate_cache_entries(request_ids: list = None,
                             request_hashes: list = None):
    """
    Invalidates a list of request IDs and/or request hashes.
    Invalidation refers to the invalidation of the request in DynamoDB
    and the deletion of the associated matrix in S3.

    Invalidated requests will return an `ERROR` state and explanation
    to the user via the GET endpoint.

    Request hashes are resolved to a list of associated request IDs
    for the deployment's current data version.

    Either argument may be omitted or passed as `None`; it is then treated
    as an empty list. The caller's `request_ids` list is never modified.

    :param request_ids: list of request IDs to invalidate
    :param request_hashes: list of request hashes to invalidate
    :raises KeyError: if `DEPLOYMENT_STAGE` or `MATRIX_RESULTS_BUCKET` is
                      missing from the environment
    """
    # Normalise the None defaults and work on a copy: IDs resolved from
    # hashes are added below and must not leak into the caller's list.
    request_ids = list(request_ids) if request_ids else []
    request_hashes = request_hashes or []

    print(f"Invalidating request IDs: {request_ids}")
    print(f"Invalidating request hashes: {request_hashes}")
    deployment_stage = os.environ['DEPLOYMENT_STAGE']
    dynamo_handler = DynamoHandler()
    data_version = dynamo_handler.get_table_item(
        table=DynamoTable.DEPLOYMENT_TABLE,
        key=deployment_stage)[DeploymentTableField.CURRENT_DATA_VERSION.value]

    # Resolve every hash to the request IDs recorded for it under the
    # current data version.
    for request_hash in request_hashes:
        items = dynamo_handler.filter_table_items(
            table=DynamoTable.REQUEST_TABLE,
            attrs={
                RequestTableField.REQUEST_HASH.value: request_hash,
                RequestTableField.DATA_VERSION.value: data_version
            })
        request_ids.extend(
            item[RequestTableField.REQUEST_ID.value] for item in items)

    s3_keys_to_delete = []
    for request_id in request_ids:
        print(f"Writing deletion error to {request_id} in DynamoDB.")
        request_tracker = RequestTracker(request_id=request_id)
        request_tracker.log_error(
            "This request has been deleted and is no longer available for download. "
            "Please generate a new matrix at POST /v1/matrix.")
        s3_keys_to_delete.append(request_tracker.s3_results_key)

    print(f"Deleting matrices at the following S3 keys: {s3_keys_to_delete}")
    s3_results_bucket_handler = S3Handler(os.environ['MATRIX_RESULTS_BUCKET'])
    deleted_objects = s3_results_bucket_handler.delete_objects(
        s3_keys_to_delete)

    deleted_keys = [
        deleted_object['Key'] for deleted_object in deleted_objects
    ]

    print(
        f"Successfully deleted the following matrices {deleted_keys}. ({len(deleted_keys)}/{len(s3_keys_to_delete)})"
    )
コード例 #4
0
    def run(self, max_loops=None):
        """
        Polls the query job queue and runs the query referenced by each message.

        On every pass the first received message (if any) is handled: the
        query stored at the message's `s3_obj_key` is loaded, run as a
        read-only transaction, the message is deleted and the query subtask is
        marked complete. When the tracker reports the request as ready for
        conversion, a batch conversion job is scheduled and its ID recorded.

        If any of those steps raises, the error is logged on the request, the
        payload is added to the dead-letter queue and the message is deleted
        from the job queue.

        :param max_loops: number of polling passes to perform; `None` means
                          poll indefinitely
        """
        passes = 0
        while True:
            if max_loops is not None and not passes < max_loops:
                break
            passes += 1

            messages = self.sqs_handler.receive_messages_from_queue(self.query_job_q_url)
            if not messages:
                logger.info(f"No messages to read from {self.query_job_q_url}")
                continue

            # Only the first message of a batch is handled per pass.
            message = messages[0]
            logger.info(f"Received {message} from {self.query_job_q_url}")
            payload = json.loads(message['Body'])
            request_id = payload['request_id']
            request_tracker = RequestTracker(request_id)
            Logging.set_correlation_id(logger, value=request_id)
            obj_key = payload['s3_obj_key']
            receipt = message['ReceiptHandle']

            try:
                logger.info(f"Fetching query from {obj_key}")
                query = self.s3_handler.load_content_from_obj_key(obj_key)

                logger.info(f"Running query from {obj_key}")
                self.redshift_handler.transaction([query], read_only=True)
                logger.info(f"Finished running query from {obj_key}")

                logger.info(f"Deleting {message} from {self.query_job_q_url}")
                self.sqs_handler.delete_message_from_queue(self.query_job_q_url, receipt)

                logger.info("Incrementing completed queries in state table")
                request_tracker.complete_subtask_execution(Subtask.QUERY)

                if request_tracker.is_request_ready_for_conversion():
                    logger.info("Scheduling batch conversion job")
                    conversion_job_id = self.batch_handler.schedule_matrix_conversion(
                        request_id, request_tracker.format)
                    request_tracker.write_batch_job_id_to_db(conversion_job_id)
            except Exception as e:
                # Record the failure, park the payload on the dead-letter
                # queue, then remove the message from the job queue.
                logger.info(f"QueryRunner failed on {message} with error {e}")
                request_tracker.log_error(str(e))
                logger.info(f"Adding {message} to {self.query_job_deadletter_q_url}")
                self.sqs_handler.add_message_to_queue(self.query_job_deadletter_q_url, payload)
                logger.info(f"Deleting {message} from {self.query_job_q_url}")
                self.sqs_handler.delete_message_from_queue(self.query_job_q_url, receipt)
コード例 #5
0
    def setUp(self, mock_get_datetime_now):
        """
        Builds a request tracker for a fresh request ID and creates the
        matching request table entry.

        :param mock_get_datetime_now: mock whose return value is pinned to
            `self.stub_date` (presumably injected by a patch decorator that
            is outside this view -- TODO confirm)
        """
        super(TestRequestTracker, self).setUp()
        # Fixed timestamp handed back by the mock for the rest of the test.
        self.stub_date = '2019-03-18T180907.136216Z'
        mock_get_datetime_now.return_value = self.stub_date

        self.request_id = str(uuid.uuid4())
        self.request_tracker = RequestTracker(self.request_id)
        self.dynamo_handler = DynamoHandler()

        # The table is created before the entry for this request is written.
        self.create_test_request_table()

        self.dynamo_handler.create_request_table_entry(self.request_id,
                                                       "test_format")
コード例 #6
0
def post_matrix(body: dict):
    """
    Validates a matrix request and, if it is acceptable, starts the driver.

    The request is rejected when `format` is not a `MatrixFormat` value, when
    `filter` is absent, or when the JSON-serialised filter is longer than
    128000 characters. Otherwise a new request ID is generated, the request is
    initialized through `RequestTracker` and the v1 driver lambda is invoked.

    :param body: request body; `feature`, `fields` and `format` are optional,
                 `filter` is required
    :return: tuple of (response body dict, HTTP status code)
    """
    feature = body.get("feature", constants.DEFAULT_FEATURE)
    fields = body.get("fields", constants.DEFAULT_FIELDS)
    format_ = body.get('format', MatrixFormat.LOOM.value)
    valid_formats = [mf.value for mf in MatrixFormat]

    # Validate input parameters; the first failing check decides the response.
    rejection = None
    if format_ not in valid_formats:
        rejection = (
            "Invalid parameters supplied. "
            "Please supply a valid `format`. "
            "Visit https://matrix.dev.data.humancellatlas.org for more information.",
            requests.codes.bad_request)
    elif "filter" not in body:
        rejection = (
            "Invalid parameters supplied. "
            "Please supply a filter. "
            "Visit https://matrix.dev.data.humancellatlas.org for more information.",
            requests.codes.bad_request)
    elif len(json.dumps(body["filter"])) > 128000:
        rejection = (
            "The filter specification is too large. "
            "Visit https://matrix.dev.data.humancellatlas.org for more information.",
            requests.codes.request_entity_too_large)

    if rejection is not None:
        reason, status_code = rejection
        return ({'message': reason}, status_code)

    request_id = str(uuid.uuid4())
    tracker = RequestTracker(request_id)
    tracker.initialize_request(format_, fields, feature)

    payload_for_driver = {
        'request_id': request_id,
        'filter': body["filter"],
        'fields': fields,
        'feature': feature
    }
    lambda_handler.invoke(LambdaName.DRIVER_V1, payload_for_driver)

    accepted_body = {
        'request_id': request_id,
        'status': MatrixRequestStatus.IN_PROGRESS.value,
        'matrix_url': "",
        'eta': "",
        'message': "Job started."
    }
    return (accepted_body, requests.codes.accepted)
コード例 #7
0
    def setUp(self, mock_get_datetime_now):
        """
        Builds a request tracker for a fresh request ID, creates the test
        tables and results bucket, and writes the request table entry.

        :param mock_get_datetime_now: mock whose return value is pinned to
            `self.stub_date` (presumably injected by a patch decorator that
            is outside this view -- TODO confirm)
        """
        super(TestRequestTracker, self).setUp()
        # Fixed timestamp handed back by the mock for the rest of the test.
        self.stub_date = '2019-03-18T180907.136216Z'
        mock_get_datetime_now.return_value = self.stub_date

        self.request_id = str(uuid.uuid4())
        self.request_tracker = RequestTracker(self.request_id)
        self.dynamo_handler = DynamoHandler()

        # Create the backing resources first ...
        self.create_test_data_version_table()
        self.create_test_deployment_table()
        self.create_test_request_table()
        self.create_s3_results_bucket()

        # ... then initialize the data version and deployment tables.
        self.init_test_data_version_table()
        self.init_test_deployment_table()

        self.dynamo_handler.create_request_table_entry(
            self.request_id, "test_format", ["test_field_1", "test_field_2"],
            "test_feature")
コード例 #8
0
def _matrix_response_body(request_id, status, message, matrix_url=""):
    """
    Builds the response body returned for a request found in the state table.

    :param request_id: ID of the request being reported on
    :param status: `MatrixRequestStatus` value to report
    :param message: explanation shown to the caller
    :param matrix_url: download URL of the matrix; empty when there is none
    :return: response body dict
    """
    return {
        'request_id': request_id,
        'status': status,
        'matrix_url': matrix_url,
        'eta': "",
        'message': message
    }


def get_matrix(request_id: str):
    """
    Reports the current state of a matrix request.

    An unknown request ID yields a 404. Every other outcome (in progress,
    failed, complete, expired, timed out) is returned with a 200 and a body
    whose `status` and `message` describe the state.

    :param request_id: ID of the request to look up
    :return: tuple of (response body dict, HTTP status code)
    :raises KeyError: if the request is complete and `MATRIX_RESULTS_BUCKET`
                      is missing from the environment
    """
    # There are a few cases to handle here. First, if the request_id is not in
    # the state table at all, then this id has never been made and we should
    # 404.
    request_tracker = RequestTracker(request_id)
    if not request_tracker.is_initialized:
        return ({
            'message': f"Unable to find job with request ID {request_id}."
        }, requests.codes.not_found)

    in_progress_response = (_matrix_response_body(
        request_id,
        MatrixRequestStatus.IN_PROGRESS.value,
        f"Request {request_id} has been accepted and is currently being "
        f"processed. Please try again later."), requests.codes.ok)

    # if the request tracker is not able to retrieve the format,
    # it means that the driver has not created the relevant entry in the output table yet.
    try:
        matrix_format = request_tracker.format
    except MatrixException:
        return in_progress_response

    # Failed case
    if request_tracker.error:
        return (_matrix_response_body(request_id,
                                      MatrixRequestStatus.FAILED.value,
                                      request_tracker.error),
                requests.codes.ok)

    # Check for failed batch conversion job
    if request_tracker.batch_job_status == "FAILED":
        # Adjacent literals keep the message on a single line; a backslash
        # continuation inside the string would embed the source indentation.
        request_tracker.log_error(
            "The matrix conversion as a part of the request has failed. "
            "Please retry or contact an hca admin for help.")
        return (_matrix_response_body(request_id,
                                      MatrixRequestStatus.FAILED.value,
                                      request_tracker.error),
                requests.codes.ok)

    # Complete case
    if request_tracker.is_request_complete():
        matrix_results_bucket = os.environ['MATRIX_RESULTS_BUCKET']
        matrix_results_handler = S3Handler(matrix_results_bucket)

        # Loom results are stored as-is, csv and mtx results as zip archives.
        matrix_key = ""
        if matrix_format == MatrixFormat.LOOM.value:
            matrix_key = f"{request_tracker.s3_results_prefix}/{request_id}.{matrix_format}"
        elif matrix_format in (MatrixFormat.CSV.value, MatrixFormat.MTX.value):
            matrix_key = f"{request_tracker.s3_results_prefix}/{request_id}.{matrix_format}.zip"

        # A falsy size means there is no matrix to download, so no URL is
        # handed out and the message explains that no cells matched.
        if matrix_results_handler.size(matrix_key):
            matrix_location = f"https://s3.amazonaws.com/{matrix_results_bucket}/{matrix_key}"
            message = (
                f"Request {request_id} has successfully completed. "
                f"The resultant expression matrix is available for download at "
                f"{matrix_location}.")
        else:
            matrix_location = ""
            message = (
                f"Request {request_id} has successfully completed. "
                f"But, there were no cells associated with this request and "
                f"species {request_tracker.genus_species.value}")

        return (_matrix_response_body(request_id,
                                      MatrixRequestStatus.COMPLETE.value,
                                      message,
                                      matrix_url=matrix_location),
                requests.codes.ok)

    # Expired case
    if request_tracker.is_expired:
        return (_matrix_response_body(request_id,
                                      MatrixRequestStatus.EXPIRED.value,
                                      request_tracker.error),
                requests.codes.ok)

    # Timeout case
    if request_tracker.timeout:
        return (_matrix_response_body(request_id,
                                      MatrixRequestStatus.FAILED.value,
                                      request_tracker.error),
                requests.codes.ok)

    return in_progress_response
コード例 #9
0
def post_matrix(body: dict):
    """
    Validates a matrix request and starts one driver invocation per species.

    The request is rejected when `format` is not a `MatrixFormat` value, when
    `filter` is absent, or when the JSON-serialised filter is longer than
    128000 characters. If the filter has a genus/species term, a request is
    started for every known species; otherwise only for human.

    :param body: request body; `feature`, `fields` and `format` are optional,
                 `filter` is required
    :return: tuple of (response body dict, HTTP status code); on success the
             body carries the human request ID under `request_id` and all
             other request IDs, keyed by species, under
             `non_human_request_ids`
    """
    feature = body.get("feature", constants.DEFAULT_FEATURE)
    fields = body.get("fields", constants.DEFAULT_FIELDS)
    format_ = body.get('format', MatrixFormat.LOOM.value)
    expected_formats = [mf.value for mf in MatrixFormat]

    # Validate input parameters
    if format_ not in expected_formats:
        return ({
            'message':
            "Invalid parameters supplied. "
            "Please supply a valid `format`. "
            "Visit https://matrix.dev.data.humancellatlas.org for more information."
        }, requests.codes.bad_request)
    if "filter" not in body:
        return ({
            'message':
            "Invalid parameters supplied. "
            "Please supply a filter. "
            "Visit https://matrix.dev.data.humancellatlas.org for more information."
        }, requests.codes.bad_request)

    if len(json.dumps(body["filter"])) > 128000:
        return ({
            'message':
            "The filter specification is too large. "
            "Visit https://matrix.dev.data.humancellatlas.org for more information."
        }, requests.codes.request_entity_too_large)

    if query_constructor.has_genus_species_term(body["filter"]):
        # If the user has mentioned something about species, then maybe
        # they're looking for non-human data. So we'll run queries for all
        # the species that we know about.
        genera_species = list(constants.GenusSpecies)
    else:
        # Otherwise, default to human-only
        genera_species = [constants.GenusSpecies.HUMAN]

    # Cell barcode is required in .mtx matrices by 10x specifications
    if format_ == MatrixFormat.MTX.value and "cell.barcode" not in fields and "barcode" not in fields:
        # Build a new list instead of appending: `fields` may be the shared
        # constants.DEFAULT_FIELDS object (or the caller's list), and an
        # in-place append would leak the extra field into later requests.
        fields = [*fields, "cell.barcode"]

    human_request_id = ""
    non_human_request_ids = {}
    for genus_species in genera_species:
        request_id = str(uuid.uuid4())
        RequestTracker(request_id).initialize_request(format_, fields, feature,
                                                      genus_species)

        driver_payload = {
            'request_id': request_id,
            'filter': body["filter"],
            'fields': fields,
            'feature': feature,
            'genus_species': genus_species.value
        }
        lambda_handler.invoke(LambdaName.DRIVER_V1, driver_payload)

        # Referenced through `constants`, like every other use in this
        # function.
        if genus_species == constants.GenusSpecies.HUMAN:
            human_request_id = request_id
        else:
            non_human_request_ids[genus_species.value] = request_id

    return ({
        'request_id': human_request_id,
        'non_human_request_ids': non_human_request_ids,
        'status': MatrixRequestStatus.IN_PROGRESS.value,
        'message': "Job started."
    }, requests.codes.accepted)
コード例 #10
0
    def test_is_initialized(self):
        """
        Checks that `is_initialized` is truthy for the fixture tracker and
        falsy for a tracker built from an ID this test never registers.
        """
        self.assertTrue(self.request_tracker.is_initialized)

        # "test_uuid" has no table entry written for it in this test.
        new_request_tracker = RequestTracker("test_uuid")
        self.assertFalse(new_request_tracker.is_initialized)
コード例 #11
0
def post_matrix(body: dict):
    """
    Validates a bundle-based matrix request and starts the v0 driver for it.

    Exactly one of `bundle_fqids` or `bundle_fqids_url` must be supplied. An
    inline `bundle_fqids` list must be non-empty and its JSON serialisation
    must not exceed 128000 characters. `format`, when given, must be a
    `MatrixFormat` value; it defaults to loom.

    :param body: request body
    :return: tuple of (response body dict, HTTP status code)
    """
    has_ids = 'bundle_fqids' in body
    has_url = 'bundle_fqids_url' in body

    # Trailing underscore avoids shadowing the `format` builtin.
    format_ = body.get('format', MatrixFormat.LOOM.value)
    expected_formats = [mf.value for mf in MatrixFormat]

    # Validate input parameters
    if format_ not in expected_formats:
        return ({
            'message':
            "Invalid parameters supplied. "
            "Please supply a valid `format`. "
            "Visit https://matrix.data.humancellatlas.org for more information."
        }, requests.codes.bad_request)
    if has_ids and has_url:
        return ({
            'message':
            "Invalid parameters supplied. "
            "Please supply either one of `bundle_fqids` or `bundle_fqids_url`. "
            "Visit https://matrix.data.humancellatlas.org for more information."
        }, requests.codes.bad_request)

    if not has_ids and not has_url:
        return ({
            'message':
            "Invalid parameters supplied. "
            "One of `bundle_fqids` or `bundle_fqids_url` must be supplied. "
            "Visit https://matrix.data.humancellatlas.org for more information."
        }, requests.codes.bad_request)

    # Past the checks above exactly one of the two keys is present, so
    # `not has_url` implies `bundle_fqids` can be read safely.
    if not has_url and len(json.dumps(body['bundle_fqids'])) > 128000:
        return ({
            'message':
            "List of bundle fqids is too large. "
            "Consider using bundle_fqids_url instead. "
            "Visit https://matrix.data.humancellatlas.org for more information."
        }, requests.codes.request_entity_too_large)

    if has_url:
        bundle_fqids_url = body['bundle_fqids_url']
        bundle_fqids = None
    else:
        bundle_fqids = body['bundle_fqids']
        bundle_fqids_url = None
        # Truthiness rather than len(): an explicit null is rejected with a
        # 400 like an empty list, instead of raising TypeError.
        if not bundle_fqids:
            return ({
                'message':
                "Invalid parameters supplied. "
                "Please supply non empty `bundle_fqids`. "
                "Visit https://matrix.data.humancellatlas.org for more information."
            }, requests.codes.bad_request)

    request_id = str(uuid.uuid4())
    RequestTracker(request_id).initialize_request(format_)
    driver_payload = {
        'request_id': request_id,
        'bundle_fqids': bundle_fqids,
        'bundle_fqids_url': bundle_fqids_url,
        'format': format_,
    }
    lambda_handler.invoke(LambdaName.DRIVER_V0, driver_payload)

    return ({
        'request_id': request_id,
        'status': MatrixRequestStatus.IN_PROGRESS.value,
        'matrix_url': "",
        'eta': "",
        'message': "Job started."
    }, requests.codes.accepted)