Esempio n. 1
0
async def PUT_Object(request):
    """Handle a notification that a single object was created or updated.

    The object id is taken from the ``id`` entry of the request's match
    info.  If it is a schema-2 id, the id returned by getRootObjId is added
    to the app's "pending" set.  Ids that are valid but not schema-2 are
    accepted without being queued.

    Returns:
        An empty JSON object response with status 201.

    Raises:
        HTTPBadRequest: no id was supplied, or isValidUuid rejects it.
    """
    log.request(request)
    pending_roots = request.app["pending"]
    obj_id = request.match_info.get('id')

    if obj_id:
        log.info(f"PUT_Object/{obj_id}")
    else:
        log.error("PUT_Object with no id")
        raise HTTPBadRequest()

    if not isValidUuid(obj_id):
        log.warn(f"Invalid id: {obj_id}, ignoring")
        raise HTTPBadRequest()

    if isSchema2Id(obj_id):
        # only the root is queued, not the object itself
        root = getRootObjId(obj_id)
        log.debug(f"adding root: {root} to pending queue for objid: {obj_id}")
        pending_roots.add(root)

    response = json_response({}, status=201)
    log.response(request, resp=response)
    return response
Esempio n. 2
0
async def delete_metadata_obj(app,
                              obj_id,
                              notify=True,
                              root_id=None,
                              bucket=None):
    """Delete the given metadata object from the local state and from S3.

    The id is recorded in app['deleted_ids'], dropped from the meta cache
    and from the dirty id map, and its S3 key is deleted if it exists.
    For schema-2 uuids: a root id is added to app["gc_ids"]; any other id
    triggers notify_root on its root when ``notify`` is true.

    Args:
        app: application mapping holding the caches and id sets.
        obj_id: id of the object to delete (uuid or domain).
        notify: whether to call notify_root for non-root schema-2 objects.
        root_id: accepted for interface compatibility; the value passed in
            is not used, the root is always derived from obj_id.
        bucket: bucket to use; replaced by getBucketForDomain(obj_id) when
            obj_id is a domain.

    Raises:
        HTTPInternalServerError: validateInPartition raised KeyError.
    """
    cache = app['meta_cache']
    dirty = app["dirty_ids"]
    log.info(f"delete_meta_data_obj: {obj_id} notify: {notify}")
    validateObjId(obj_id, bucket)
    if isValidDomain(obj_id):
        bucket = getBucketForDomain(obj_id)

    try:
        validateInPartition(app, obj_id)
    except KeyError:
        log.error(f"obj: {obj_id} not in partition")
        raise HTTPInternalServerError()

    tombstones = app['deleted_ids']
    if obj_id not in tombstones:
        log.debug(f"adding {obj_id} to deleted ids")
        tombstones.add(obj_id)
    else:
        log.warn(f"{obj_id} has already been deleted")

    # forget any cached copy and any pending write for this id
    if obj_id in cache:
        log.debug(f"removing {obj_id} from meta_cache")
        del cache[obj_id]
    if obj_id in dirty:
        log.debug(f"removing dirty_ids for: {obj_id}")
        del dirty[obj_id]

    # remove the stored copy from S3, if there is one
    s3key = getS3Key(obj_id)
    key_exists = await isS3Obj(app, s3key, bucket=bucket)
    if not key_exists:
        log.info(
            f"delete_metadata_obj - key {s3key} not found (never written)?")
    else:
        await deleteS3Obj(app, s3key, bucket=bucket)

    # domain ids fall through both branches: no notify for domain deletes
    # since the root group is being deleted
    is_v2_uuid = isValidUuid(obj_id) and isSchema2Id(obj_id)
    if is_v2_uuid and isRootObjId(obj_id):
        # queue the root for GC so that its sub-objects get deleted
        log.info(f"adding root id: {obj_id} for GC cleanup")
        app["gc_ids"].add(obj_id)
    elif is_v2_uuid and notify:
        await notify_root(app, getRootObjId(obj_id), bucket=bucket)

    log.debug(f"delete_metadata_obj for {obj_id} done")
Esempio n. 3
0
async def PUT_Objects(request):
    """HTTP method to notify creation/update of a batch of object ids.

    The request body must be a JSON object with an "objs" key holding the
    collection of object ids.  For each id that is a valid schema-2 uuid,
    the id returned by getRootObjId is added to the app's "pending" set.
    Invalid ids and v1 ids are logged and skipped; they do not fail the
    request.

    Returns:
        An empty JSON object response with status 201.

    Raises:
        HTTPBadRequest: the request has no body, the body is not valid
            JSON, the body is not a JSON object containing "objs", or
            "objs" is not a list (or mapping) of ids.
    """
    log.request(request)
    app = request.app
    pending_set = app["pending"]
    log.info("PUT_Objects")

    if not request.has_body:
        msg = "PUT objects with no body"
        log.warn(msg)
        raise HTTPBadRequest(reason=msg)

    try:
        body = await request.json()
    except ValueError as ve:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses; a malformed body is a client error, not a server one
        msg = "PUT objects body is not valid JSON"
        log.warn(f"{msg}: {ve}")
        raise HTTPBadRequest(reason=msg) from ve

    log.debug(f"Got PUT Objects body: {body}")
    if not isinstance(body, dict) or "objs" not in body:
        msg = "expected to find objs key in body"
        log.warn(msg)
        raise HTTPBadRequest(reason=msg)

    objs = body["objs"]
    if not isinstance(objs, (list, dict)):
        # null or a scalar can't be iterated as a collection of ids;
        # a mapping is still accepted and iterated by key, as before
        msg = "expected objs to be a list of object ids"
        log.warn(msg)
        raise HTTPBadRequest(reason=msg)

    for objid in objs:
        log.debug(f"PUT_Objects, objid: {objid}")
        if not isValidUuid(objid):
            log.warn(f"Invalid id: {objid}, ignoring")
            continue

        if not isSchema2Id(objid):
            log.info(f"PUT_Objects ignoring v1 id: {objid}")
            continue

        rootid = getRootObjId(objid)
        log.debug(f"adding root: {rootid} to pending queue for objid: {objid}")
        pending_set.add(rootid)

    resp = json_response({}, status=201)
    log.response(request, resp=resp)
    return resp
Esempio n. 4
0
async def PUT_Group(request):
    """Handler for PUT /groups.

    Used to flush all objects under a root group to S3.  The handler does
    not write anything itself: it snapshots the ids currently in
    app["dirty_ids"] that belong to the root group and polls until each one
    has either left the dirty map or carries a timestamp later than the
    start of the flush.

    Returns:
        A 204 (no content) response once nothing is left to wait for.

    Raises:
        HTTPInternalServerError: the id is not a valid group id, or it is a
            schema-2 id that is not a root id.
        HTTPServiceUnavailable: objects are still pending after the timeout.
    """
    flush_timeout = 10.0  # TBD make config
    poll_interval = 0.1  # TBD make config
    log.request(request)
    app = request.app
    params = request.rel_url.query

    root_id = request.match_info.get('id')
    bucket = params["bucket"] if "bucket" in params else None
    log.info(f"PUT group (flush): {root_id}  bucket: {bucket}")
    # the bucket param isn't really needed since the dirty ids know which
    # bucket they should be written to

    if not isValidUuid(root_id, obj_class="group"):
        log.error(f"Unexpected group_id: {root_id}")
        raise HTTPInternalServerError()

    schema2 = isSchema2Id(root_id)
    if schema2 and not isRootObjId(root_id):
        log.error(f"Expected root id for flush but got: {root_id}")
        raise HTTPInternalServerError()

    flush_start = time.time()
    dirty_ids = app["dirty_ids"]

    # For schema2, wait only on ids whose root is this group.  For schema1
    # it is not easy to determine if a given id is in a domain, so just
    # wait on all of them.
    waiting = {
        obj_id
        for obj_id in dirty_ids
        if not schema2 or (isValidUuid(obj_id)
                           and getRootObjId(obj_id) == root_id)
    }

    log.debug(f"flushop - waiting on {len(waiting)} items")
    while time.time() - flush_start < flush_timeout:
        # keep only the ids that are still dirty from before the flush began
        still_pending = set()
        for obj_id in waiting:
            if obj_id in dirty_ids:
                if dirty_ids[obj_id][0] > flush_start:
                    log.debug(
                        f"flush - {obj_id} has been updated after flush start")
                else:
                    log.debug(f"flush - {obj_id} still pending")
                    still_pending.add(obj_id)
            else:
                log.debug(f"flush - {obj_id} has been written")
        waiting = still_pending
        if not waiting:
            log.debug("flush op - all objects have been written")
            break
        log.debug(
            f"flushop - {len(waiting)} item remaining, sleeping for {poll_interval}"
        )
        await asyncio.sleep(poll_interval)

    if waiting:
        log.warn(
            f"flushop - {len(waiting)} items not updated after {flush_timeout}"
        )
        raise HTTPServiceUnavailable()

    response = json_response(None, status=204)  # NO Content response
    log.response(request, resp=response)
    return response
Esempio n. 5
0
async def write_s3_obj(app, obj_id, bucket=None):
    """Write the cached chunk or metadata object for obj_id to S3.

    Chunk ids are read from app['chunk_cache'] and written with putS3Bytes
    (using any shuffle / deflate settings registered for the dataset);
    every other id is read from app['meta_cache'] and written with
    putS3JSONObj.  After a write, the cache dirty flag is cleared unless the
    dirty timestamp shows the object was updated while the write was in
    progress.

    Args:
        app: application mapping holding the caches and bookkeeping maps.
        obj_id: id of the object to write (chunk id, uuid or domain).
        bucket: target bucket.  For domain ids the bucket from
            getBucketForDomain is used when no bucket is passed; a
            conflicting value is logged and the passed bucket is kept.

    Returns:
        obj_id once the write has completed, or None when the write was
        skipped because the (uuid) object has been deleted.

    Raises:
        KeyError: the S3 key already has a write in progress, or the object
            is not in the expected cache.
        ValueError: the dirty timestamp is in the future, or the cached
            object is not marked dirty.
        Any exception raised by putS3Bytes / putS3JSONObj propagates after
        the bookkeeping in the finally block has run.
    """
    s3key = getS3Key(obj_id)
    log.info(
        f"write_s3_obj for obj_id: {obj_id} / s3_key: {s3key}  bucket: {bucket}"
    )
    pending_s3_write = app["pending_s3_write"]
    pending_s3_write_tasks = app["pending_s3_write_tasks"]
    dirty_ids = app["dirty_ids"]
    chunk_cache = app['chunk_cache']
    meta_cache = app['meta_cache']
    deflate_map = app['deflate_map']
    shuffle_map = app['shuffle_map']
    notify_objs = app["root_notify_ids"]
    deleted_ids = app['deleted_ids']
    success = False

    if isValidDomain(obj_id):
        domain_bucket = getBucketForDomain(obj_id)
        if bucket and bucket != domain_bucket:
            # mismatch is only logged; the caller's bucket is still used
            log.error(
                f"expected bucket for domain: {obj_id} to match what was passed to write_s3_obj"
            )
        else:
            bucket = domain_bucket

    if s3key in pending_s3_write:
        # a write for this key is already in flight - don't allow reentry
        msg = f"write_s3_key - not expected for key {s3key} to be in pending_s3_write map"
        log.error(msg)
        raise KeyError(msg)

    if obj_id not in pending_s3_write_tasks:
        # informational only: the write proceeds either way
        log.debug(f"write_s3_obj for {obj_id} not s3sync task")

    if obj_id in deleted_ids and isValidUuid(obj_id):
        # if this objid has been deleted (and it's unique since this is not
        # a domain id) cancel any pending task and return
        log.warn(f"Canceling write for {obj_id} since it has been deleted")
        if obj_id in pending_s3_write_tasks:
            log.info(f"removing pending s3 write task for {obj_id}")
            task = pending_s3_write_tasks[obj_id]
            task.cancel()
            del pending_s3_write_tasks[obj_id]
        return None

    now = time.time()

    last_update_time = now
    if obj_id in dirty_ids:
        # timestamp is first element of two-tuple
        last_update_time = dirty_ids[obj_id][0]
    if last_update_time > now:
        msg = f"last_update time {last_update_time} is in the future for obj_id: {obj_id}"
        log.error(msg)
        raise ValueError(msg)

    pending_s3_write[s3key] = now
    # do the following in the try block so we can always remove the
    # pending_s3_write entry at the end

    try:
        if isValidChunkId(obj_id):
            if obj_id not in chunk_cache:
                log.error(f"expected to find obj_id: {obj_id} in chunk cache")
                raise KeyError(f"{obj_id} not found in chunk cache")
            if not chunk_cache.isDirty(obj_id):
                log.error(f"expected chunk cache obj {obj_id} to be dirty")
                raise ValueError("bad dirty state for obj")
            chunk_arr = chunk_cache[obj_id]
            chunk_bytes = arrayToBytes(chunk_arr)
            dset_id = getDatasetId(obj_id)
            # defaults when nothing is registered for the dataset
            deflate_level = None
            shuffle = 0
            if dset_id in deflate_map:
                deflate_level = deflate_map[dset_id]
                log.debug(
                    f"got deflate_level: {deflate_level} for dset: {dset_id}")
            if dset_id in shuffle_map:
                shuffle = shuffle_map[dset_id]
                log.debug(f"got shuffle size: {shuffle} for dset: {dset_id}")

            await putS3Bytes(app,
                             s3key,
                             chunk_bytes,
                             shuffle=shuffle,
                             deflate_level=deflate_level,
                             bucket=bucket)
            success = True

            # if chunk has been evicted from cache something has gone wrong
            if obj_id not in chunk_cache:
                msg = f"expected to find {obj_id} in chunk_cache"
                log.error(msg)
            elif obj_id in dirty_ids and dirty_ids[obj_id][
                    0] > last_update_time:
                # leave the dirty flag set so the newer data gets written
                log.info(
                    f"write_s3_obj {obj_id} got updated while s3 write was in progress"
                )
            else:
                # no new write, can clear dirty
                chunk_cache.clearDirty(obj_id)  # allow eviction from cache
                log.debug(
                    f"putS3Bytes Chunk cache utilization: {chunk_cache.cacheUtilizationPercent} per, dirty_count: {chunk_cache.dirtyCount}"
                )
        else:
            # meta data update
            # check for object in meta cache
            if obj_id not in meta_cache:
                log.error(f"expected to find obj_id: {obj_id} in meta cache")
                raise KeyError(f"{obj_id} not found in meta cache")
            if not meta_cache.isDirty(obj_id):
                log.error(f"expected meta cache obj {obj_id} to be dirty")
                raise ValueError("bad dirty state for obj")
            obj_json = meta_cache[obj_id]

            await putS3JSONObj(app, s3key, obj_json, bucket=bucket)
            success = True
            # should still be in meta_cache...
            if obj_id in deleted_ids:
                log.info(
                    f"obj {obj_id} has been deleted while write was in progress"
                )
            elif obj_id not in meta_cache:
                msg = f"expected to find {obj_id} in meta_cache"
                log.error(msg)
            elif obj_id in dirty_ids and dirty_ids[obj_id][
                    0] > last_update_time:
                # leave the dirty flag set so the newer data gets written
                log.info(
                    f"write_s3_obj {obj_id} got updated while s3 write was in progress"
                )
            else:
                meta_cache.clearDirty(obj_id)  # allow eviction from cache
    finally:
        # clear pending_s3_write item
        log.debug(f"write_s3_obj finally block, success={success}")
        if s3key not in pending_s3_write:
            msg = f"write s3 obj: Expected to find {s3key} in pending_s3_write map"
            log.error(msg)
        else:
            if pending_s3_write[s3key] != now:
                msg = f"pending_s3_write timestamp got updated unexpectedly for {s3key}"
                log.error(msg)
            del pending_s3_write[s3key]
        # clear task
        if obj_id not in pending_s3_write_tasks:
            log.debug(f"no pending s3 write task for {obj_id}")
        else:
            log.debug(f"removing pending s3 write task for {obj_id}")
            del pending_s3_write_tasks[obj_id]
        # clear dirty flag, but only if the entry is the one this write
        # started from (an entry with a different timestamp is kept)
        # NOTE(review): this also runs when the write failed
        # (success is False) - confirm that failed writes get re-queued
        if obj_id in dirty_ids and dirty_ids[obj_id][0] == last_update_time:
            log.debug(f"clearing dirty flag for {obj_id}")
            del dirty_ids[obj_id]

    # add to map so that root can be notified about changed objects
    if isValidUuid(obj_id) and isSchema2Id(obj_id):
        root_id = getRootObjId(obj_id)
        notify_objs[root_id] = bucket

    # calculate time to do the write
    elapsed_time = time.time() - now
    log.info(f"s3 write for {s3key} took {elapsed_time:.3f}s")
    return obj_id
Esempio n. 6
0
async def save_metadata_obj(app,
                            obj_id,
                            obj_json,
                            bucket=None,
                            notify=False,
                            flush=False):
    """Persist the given metadata object.

    The object is stored in app['meta_cache'] and marked dirty.  With
    ``flush`` it is written to S3 immediately via write_s3_obj; otherwise an
    entry ``(timestamp, bucket)`` is recorded in app["dirty_ids"] for a
    later write.

    Args:
        app: application mapping holding the caches and id sets.
        obj_id: id of the object to save (uuid or domain).
        obj_json: dict with the object's metadata.
        bucket: bucket the object should be written to.
        notify: call notify_root right after the write; only valid together
            with ``flush``.
        flush: write to S3 before returning instead of queueing.

    Raises:
        HTTPInternalServerError: notify was set without flush, obj_json is
            not a dict, the id is not in this partition, the id is a uuid
            that has already been deleted, or the S3 write raised KeyError.
        HTTPBadRequest: flush was requested for a chunk id.
    """
    log.info(
        f"save_metadata_obj {obj_id} bucket={bucket} notify={notify} flush={flush}"
    )
    if notify and not flush:
        log.error("notify not valid when flush is false")
        raise HTTPInternalServerError()

    validateObjId(obj_id, bucket)

    if not isinstance(obj_json, dict):
        log.error("Passed non-dict obj to save_metadata_obj")
        raise HTTPInternalServerError()

    try:
        validateInPartition(app, obj_id)
    except KeyError:
        log.error("Domain not in partition")
        raise HTTPInternalServerError()

    dirty_ids = app["dirty_ids"]
    deleted_ids = app['deleted_ids']
    if obj_id in deleted_ids:
        if isValidUuid(obj_id):
            # domain objects may be re-created, but shouldn't see repeats of
            # deleted uuids
            log.warn(f"{obj_id} has been deleted")
            raise HTTPInternalServerError()
        deleted_ids.remove(obj_id)  # un-gone the domain id

    # update meta cache
    meta_cache = app['meta_cache']
    log.debug(f"save: {obj_id} to cache")
    meta_cache[obj_id] = obj_json
    meta_cache.setDirty(obj_id)

    if flush:
        # write to S3 immediately
        if isValidChunkId(obj_id):
            log.warn("flush not supported for save_metadata_obj with chunks")
            raise HTTPBadRequest()
        try:
            await write_s3_obj(app, obj_id, bucket=bucket)
        except KeyError as ke:
            log.error(f"s3 sync got key error: {ke}")
            raise HTTPInternalServerError() from ke
        except HTTPInternalServerError:
            log.warn(f" failed to write {obj_id}")
            raise  # re-throw
        if obj_id in dirty_ids:
            log.warn(
                f"save_metadata_obj flush - object {obj_id} is still dirty")
        # message AN immediately if notify flag is set
        # otherwise AN will be notified at next S3 sync
        if notify:
            if isValidUuid(obj_id) and isSchema2Id(obj_id):
                root_id = getRootObjId(obj_id)
                await notify_root(app, root_id, bucket=bucket)
    else:
        # Keep the full float timestamp: the writers compare it with ">"
        # against time.time() values to detect an update that arrived while
        # a write was in flight, and truncating to whole seconds makes an
        # update within the same second look like no update at all.
        now = time.time()
        log.debug(f"setting dirty_ids[{obj_id}] = ({now}, {bucket})")
        if isValidUuid(obj_id) and not bucket:
            log.warn(f"bucket is not defined for save_metadata_obj: {obj_id}")
        dirty_ids[obj_id] = (now, bucket)