Example 1
async def getRootInfo(app, root_id):
    """ Get extra information the root collection. """
    # Gather additional info on the domain
    log.debug(f"getRootInfo {root_id}")

    if not isSchema2Id(root_id):
        log.info(
            f"dataset details not available for schema v1 id: {root_id}, returning null results"
        )
        return None

    s3_key = getS3Key(root_id)

    parts = s3_key.split('/')
    # dset_key is in the format  db/<root>/d/<dset>/.dataset.json
    # get the key for the root info object as: db/<root>/.info.json
    if len(parts) != 3:
        log.error(f"Unexpected s3key format: {s3_key}")
        return None

    info_key = f"db/{parts[1]}/.info.json"

    try:
        info_json = await getS3JSONObj(app, info_key)
    except HTTPNotFound:
        log.warn(f"info.json not found for key: {info_key}")
        return None

    return info_json
Example 2
async def check_metadata_obj(app, obj_id):
    """ Return False is obj does not exist
    """
    if not isValidDomain(obj_id) and not isValidUuid(obj_id):
        msg = "Invalid obj id: {}".format(obj_id)
        log.error(msg)
        raise HTTPInternalServerError()

    try:
        validateInPartition(app, obj_id)
    except KeyError:
        log.error("Domain not in partition")
        raise HTTPInternalServerError()

    deleted_ids = app['deleted_ids']
    if obj_id in deleted_ids:
        msg = "{} has been deleted".format(obj_id)
        log.info(msg)
        return False

    meta_cache = app['meta_cache']
    if obj_id in meta_cache:
        found = True
    else:
        # Not in cache, check s3 obj exists
        s3_key = getS3Key(obj_id)
        log.debug("check_metadata_obj({})".format(s3_key))
        # does key exist?
        found = await isS3Obj(app, s3_key)
    return found
Example 3
async def check_metadata_obj(app, obj_id, bucket=None):
    """ Return False is obj does not exist
    """
    validateObjId(obj_id, bucket)
    if isValidDomain(obj_id):
        bucket = getBucketForDomain(obj_id)

    try:
        validateInPartition(app, obj_id)
    except KeyError:
        log.error("Domain not in partition")
        raise HTTPInternalServerError()

    deleted_ids = app['deleted_ids']
    if obj_id in deleted_ids:
        msg = f"{obj_id} has been deleted"
        log.info(msg)
        return False

    meta_cache = app['meta_cache']
    if obj_id in meta_cache:
        found = True
    else:
        # Not in cache, check s3 obj exists
        s3_key = getS3Key(obj_id)
        log.debug(f"check_metadata_obj({s3_key})")
        # does key exist?
        found = await isS3Obj(app, s3_key, bucket=bucket)
    return found
Example 4
async def removeKeys(app, objid):
    # iterate through all s3 keys under the given root or dataset id and delete them
    #
    # Note: not re-entrant!  Only one removeKeys can be run at a time per app.
    log.debug(f"removeKeys: {objid}")
    if not isSchema2Id(objid):
        log.warn("ignoring non-schema2 id")
        raise KeyError("Invalid key")
    s3key = getS3Key(objid)
    log.debug(f"got s3key: {s3key}")
    expected_suffixes = (".dataset.json", ".group.json")
    s3prefix = None

    for suffix in expected_suffixes:
        if s3key.endswith(suffix):
            s3prefix = s3key[:-len(suffix)]
    if not s3prefix:
        log.error("unexpected s3key for delete_set")
        raise KeyError("unexpected key suffix")
    log.info(f"delete for {objid} searching for s3prefix: {s3prefix}")
    if app["objDelete_prefix"]:
        log.error("objDelete_prefix is already set - impropere use of non-reentrant call?")
        # just continue and reset
    app["objDelete_prefix"] = s3prefix
    try:
        await getStorKeys(app, prefix=s3prefix, include_stats=False, callback=objDeleteCallback)
    except ClientError as ce:
        log.error(f"getS3Keys faiiled: {ce}")
    # reset the prefix
    app["objDelete_prefix"] = None
Example 5
async def DELETE_Chunk(request):
    """HTTP DELETE method for /chunks/
    Note: clients (i.e. SN nodes) don't directly delete chunks.  This method should
    only be called by the AN node.
    """
    log.request(request)
    app = request.app
    params = request.rel_url.query
    chunk_id = request.match_info.get('id')
    if not chunk_id:
        msg = "Missing chunk id"
        log.error(msg)
        raise HTTPBadRequest(reason=msg)
    log.info(f"DELETE chunk: {chunk_id}")

    if not isValidUuid(chunk_id, "Chunk"):
        msg = f"Invalid chunk id: {chunk_id}"
        log.warn(msg)
        raise HTTPBadRequest(reason=msg)
    if "bucket" in params:
        bucket = params["bucket"]
    else:
        bucket = None

    validateInPartition(app, chunk_id)

    chunk_cache = app['chunk_cache']
    s3key = getS3Key(chunk_id)
    log.debug(f"DELETE_Chunk s3_key: {s3key}")

    if chunk_id in chunk_cache:
        del chunk_cache[chunk_id]

    deflate_map = app["deflate_map"]
    shuffle_map = app["shuffle_map"]
    dset_id = getDatasetId(chunk_id)
    if dset_id in deflate_map:
        # The only reason chunks are ever deleted is if the dataset is being deleted,
        # so it should be safe to remove this entry now
        log.info(f"Removing deflate_map entry for {dset_id}")
        del deflate_map[dset_id]
    if dset_id in shuffle_map:
        log.info(f"Removing shuffle_map entry for {dset_id}")
        del shuffle_map[dset_id]

    if await isStorObj(app, s3key, bucket=bucket):
        await deleteStorObj(app, s3key, bucket=bucket)
    else:
        log.info(
            f"DELETE_Chunk - key {s3key} not found (never written)?")

    resp_json = {}
    resp = json_response(resp_json)
    log.response(request, resp=resp)
    return resp
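For orientation, a hedged sketch of how a handler like DELETE_Chunk might be registered in an aiohttp application; the /chunks/{id} path follows the docstring above, but the registration helper itself is an assumption and may not match how the service wires its routes.

from aiohttp import web

def register_chunk_routes(app: web.Application):
    # Assumption: the data node exposes the chunk DELETE endpoint at /chunks/{id};
    # the real service may register this route differently.
    app.router.add_delete("/chunks/{id}", DELETE_Chunk)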
Example 6
async def delete_metadata_obj(app,
                              obj_id,
                              notify=True,
                              root_id=None,
                              bucket=None):
    """ Delete the given object """
    meta_cache = app['meta_cache']
    dirty_ids = app["dirty_ids"]
    log.info(f"delete_meta_data_obj: {obj_id} notify: {notify}")
    validateObjId(obj_id, bucket)
    if isValidDomain(obj_id):
        bucket = getBucketForDomain(obj_id)

    try:
        validateInPartition(app, obj_id)
    except KeyError:
        log.error(f"obj: {obj_id} not in partition")
        raise HTTPInternalServerError()

    deleted_ids = app['deleted_ids']
    if obj_id in deleted_ids:
        log.warn(f"{obj_id} has already been deleted")
    else:
        log.debug(f"adding {obj_id} to deleted ids")
        deleted_ids.add(obj_id)

    if obj_id in meta_cache:
        log.debug(f"removing {obj_id} from meta_cache")
        del meta_cache[obj_id]

    if obj_id in dirty_ids:
        log.debug(f"removing dirty_ids for: {obj_id}")
        del dirty_ids[obj_id]

    # remove from S3 (if present)
    s3key = getS3Key(obj_id)

    if await isS3Obj(app, s3key, bucket=bucket):
        await deleteS3Obj(app, s3key, bucket=bucket)
    else:
        log.info(
            f"delete_metadata_obj - key {s3key} not found (never written)?")

    if isValidUuid(obj_id) and isSchema2Id(obj_id):
        if isRootObjId(obj_id):
            # add to gc ids so sub-objects will be deleted
            gc_ids = app["gc_ids"]
            log.info(f"adding root id: {obj_id} for GC cleanup")
            gc_ids.add(obj_id)
        elif notify:
            root_id = getRootObjId(obj_id)
            await notify_root(app, root_id, bucket=bucket)
        # no notify for domain deletes since the root group is being deleted

    log.debug(f"delete_metadata_obj for {obj_id} done")
Example 7
async def scanRoot(app, rootid, update=False, bucket=None):

    # iterate through all s3 keys under the given root.
    # Return dict with stats for the root.
    #
    # Note: not re-entrant!  Only one scanRoot can be run at a time per app.
    log.info(f"scanRoot for rootid: {rootid} bucket: {bucket}")

    if not isValidUuid(rootid):
        raise ValueError("Invalid root id")

    if not isSchema2Id(rootid):
        log.warn(f"no tabulation for schema v1 id: {rootid} returning null results")
        return {}

    if not bucket:
        bucket = config.get("bucket_name")
    if not bucket:
        raise ValueError(f"no bucket defined for scan of {rootid}")

    root_key = getS3Key(rootid)

    if not root_key.endswith("/.group.json"):
        raise ValueError("unexpected root key")
    root_prefix = root_key[:-(len(".group.json"))]

    log.debug(f"scanRoot - using prefix: {root_prefix}")

    results = {}
    results["lastModified"] = 0
    results["num_groups"] = 0
    results["num_datatypes"] = 0
    results["datasets"] = {}  # since we need per dataset info
    results["num_chunks"] = 0
    results["allocated_bytes"] = 0
    results["metadata_bytes"] = 0
    results["scan_start"] = time.time()

    app["scanRoot_results"] = results

    await getStorKeys(app, prefix=root_prefix, include_stats=True, bucket=bucket, callback=scanRootCallback)

    log.info(f"scan complete for rootid: {rootid}")
    results["scan_complete"] = time.time()

    if update:
        # write .info object back to S3
        info_key = root_prefix + ".info.json"
        log.info(f"updating info key: {info_key}")
        await putStorJSONObj(app, info_key, results, bucket=bucket)
    return results
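A hedged usage sketch for scanRoot: how a caller might trigger a scan and log a few of the tallied fields. The result keys come from the results dict built above; the wrapper function itself is illustrative only and not part of the service.

async def report_root_usage(app, rootid, bucket=None):
    # Illustrative helper: run a scan and summarize the results.
    results = await scanRoot(app, rootid, update=True, bucket=bucket)
    if not results:
        log.warn(f"no scan results for root: {rootid}")
        return
    msg = f"root {rootid}: {results['num_groups']} groups, "
    msg += f"{results['num_datatypes']} datatypes, "
    msg += f"{results['num_chunks']} chunks, "
    msg += f"{results['allocated_bytes']} allocated bytes"
    log.info(msg)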
Example 8
async def printS3Obj(app, obj_id):
    try:
        s3_key = getS3Key(obj_id)
        obj_exists = await isS3Obj(app, s3_key)
        if not obj_exists:
            print("s3 key: {} not found".format(s3_key))
            return
        json_obj = await getS3JSONObj(app, s3_key)
        print("s3key {}:".format(s3_key))
        print(json.dumps(json_obj, sort_keys=True, indent=4))
    except ValueError as ve:
        print("Got ValueError exception: {}".format(str(ve)))
    except ClientOSError as coe:
        print("Got S3 error: {}".format(str(coe)))
Example 9
async def printS3Obj(app, obj_id):
    try:
        s3_key = getS3Key(obj_id)
        obj_exists = await isStorObj(app, s3_key)
        if not obj_exists:
            print(f"key: {s3_key} not found")
            return
        json_obj = await getStorJSONObj(app, s3_key)
        print(f"s3key {s3_key}:")
        print(json.dumps(json_obj, sort_keys=True, indent=4))
    except ValueError as ve:
        print(f"Got ValueError exception: {ve}")
    except ClientOSError as coe:
        print(f"Got error: {coe}")
    await releaseStorageClient(app)
Example 10
async def createDomain(app, domain, domain_json):
    try:
        s3_key = getS3Key(domain)
        domain_exists = await isS3Obj(app, s3_key)
        if domain_exists:
            raise ValueError("Domain already exists")
        parent_domain = getParentDomain(domain)
        if parent_domain is None:
            raise ValueError("Domain must have a parent")
        
        log.info("writing domain")
        await putS3JSONObj(app, s3_key, domain_json)
        print("domain created!  s3_key: {}  domain_json: {}".format(s3_key, domain_json))
    except ValueError as ve:
        print("Got ValueError exception: {}".format(str(ve)))
    except ClientOSError as coe:
        print("Got S3 error: {}".format(str(coe)))  
Example 11
async def DELETE_Chunk(request):
    """HTTP DELETE method for /chunks/
    Note: clients (i.e. SN nodes) don't directly delete chunks.  This method should
    only be called by the AN node.
    """
    log.request(request)
    app = request.app
    chunk_id = request.match_info.get('id')
    if not chunk_id:
        msg = "Missing chunk id"
        log.error(msg)
        raise HTTPBadRequest(reason=msg)
    log.info("DELETE chunk: {}".format(chunk_id))

    if not isValidUuid(chunk_id, "Chunk"):
        msg = "Invalid chunk id: {}".format(chunk_id)
        log.warn(msg)
        raise HTTPBadRequest(reason=msg)

    validateInPartition(app, chunk_id)

    chunk_cache = app['chunk_cache'] 
    s3_key = getS3Key(chunk_id)
    log.debug("DELETE_Chunk s3_key: {}".format(s3_key))

    if chunk_id in chunk_cache:
        del chunk_cache[chunk_id]

    deflate_map = app["deflate_map"]
    dset_id = getDatasetId(chunk_id)
    if dset_id in deflate_map:
        # The only reason chunks are ever deleted is if the dataset is being deleted,
        # so it should be safe to remove this entry now
        log.info("Removing deflate_map entry for {}".format(dset_id))
        del deflate_map[dset_id]

    resp_json = {  }   
    resp = json_response(resp_json)
    log.response(request, resp=resp)
    return resp
Example 12
async def s3sync(app):
    """ Periodic method that writes dirty objects in the metadata cache to S3"""
    MAX_PENDING_WRITE_REQUESTS = 20
    dirty_ids = app["dirty_ids"]
    pending_s3_write = app["pending_s3_write"]
    pending_s3_write_tasks = app["pending_s3_write_tasks"]
    s3_sync_interval = config.get("s3_sync_interval")
    dirty_count = len(dirty_ids)
    if not dirty_count:
        log.info("s3sync nothing to update")
        return 0

    log.info(
        f"s3sync update - dirtyid count: {dirty_count}, active write tasks: {len(pending_s3_write_tasks)}/{MAX_PENDING_WRITE_REQUESTS}"
    )
    log.debug(f"s3sync dirty_ids: {dirty_ids}")
    log.debug(f"sesync pending write s3keys: {list(pending_s3_write.keys())}")
    log.debug(f"s3sync write tasks: {list(pending_s3_write_tasks.keys())}")

    def callback(future):
        try:
            obj_id = future.result()  # returns an obj_id
            log.info(f"write_s3_obj callback result: {obj_id}")
        except HTTPInternalServerError as hse:
            log.error(f"write_s3_obj callback got 500: {hse}")
        except Exception as e:
            log.error(
                f"write_s3_obj callback unexpected exception {type(e)}: {e}")

    update_count = 0
    s3sync_start = time.time()

    log.info(f"s3sync - processing {len(dirty_ids)} dirty_ids")
    for obj_id in dirty_ids:
        item = dirty_ids[obj_id]
        log.debug(f"got item: {item} for obj_id: {obj_id}")
        bucket = item[1]
        if not bucket:
            if "bucket_name" in app and app["bucket_name"]:
                bucket = app["bucket_name"]
            else:
                log.error(
                    f"can not determine bucket for s3sync obj_id: {obj_id}")
                continue
        s3key = getS3Key(obj_id)
        log.debug(
            f"s3sync dirty id: {obj_id}, s3key: {s3key} bucket: {bucket}")

        create_task = True
        if s3key in pending_s3_write:
            log.debug(
                f"key {s3key} has been pending for {s3sync_start - pending_s3_write[s3key]}"
            )
            if s3sync_start - pending_s3_write[s3key] > s3_sync_interval * 2:
                log.warn(
                    f"obj {obj_id} has been in pending_s3_write for {s3sync_start - pending_s3_write[s3key]} seconds, restarting"
                )
                del pending_s3_write[s3key]
                if obj_id not in pending_s3_write_tasks:
                    log.error(f"Expected to find write task for {obj_id}")
                else:
                    task = pending_s3_write_tasks[obj_id]
                    task.cancel()
                    del pending_s3_write_tasks[obj_id]
            else:
                log.debug(f"key {s3key} has a pending write task")
                create_task = False
                if obj_id not in pending_s3_write_tasks:
                    log.error(
                        f"expected to find {obj_id} in pending_s3_write_tasks")
        if create_task:
            if len(pending_s3_write_tasks) < MAX_PENDING_WRITE_REQUESTS:
                # create a task to write this object
                log.debug(f"s3sync - ensure future for {obj_id}")
                task = asyncio.ensure_future(
                    write_s3_obj(app, obj_id, bucket=bucket))
                task.add_done_callback(callback)
                pending_s3_write_tasks[obj_id] = task
                update_count += 1
            else:
                log.debug(
                    f"s3sync - too many pending tasks, not creating task for: {obj_id} now"
                )

    # notify root of obj updates
    notify_ids = app["root_notify_ids"]
    if len(notify_ids) > 0:
        log.info(f"Notifying for {len(notify_ids)} S3 Updates")
        # copy the ids into a set since we can't modify notify_ids while iterating over it
        root_ids = set()
        for root_id in notify_ids:
            root_ids.add(root_id)

        for root_id in root_ids:
            bucket = notify_ids[root_id]
            await notify_root(app, root_id, bucket=bucket)
            del notify_ids[root_id]
        log.info("root notify complete")

    # return number of objects written
    return update_count
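s3sync is described as a periodic method; a minimal sketch of a driver loop that could schedule it is shown below. The runner itself is an assumption (the service may use its own task scheduler), but it only relies on names already present in this example: s3sync, config.get("s3_sync_interval"), asyncio, and log.

async def s3sync_runner(app):
    # Sketch only: call s3sync roughly every s3_sync_interval seconds.
    s3_sync_interval = config.get("s3_sync_interval")
    while True:
        try:
            update_count = await s3sync(app)
            if update_count:
                log.info(f"s3sync_runner: initiated {update_count} object writes")
        except Exception as e:
            log.error(f"s3sync_runner: unexpected exception {type(e)}: {e}")
        await asyncio.sleep(s3_sync_interval)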
Example 13
async def write_s3_obj(app, obj_id, bucket=None):
    """ writes the given object to s3 """
    s3key = getS3Key(obj_id)
    log.info(
        f"write_s3_obj for obj_id: {obj_id} / s3_key: {s3key}  bucket: {bucket}"
    )
    pending_s3_write = app["pending_s3_write"]
    pending_s3_write_tasks = app["pending_s3_write_tasks"]
    dirty_ids = app["dirty_ids"]
    chunk_cache = app['chunk_cache']
    meta_cache = app['meta_cache']
    deflate_map = app['deflate_map']
    shuffle_map = app['shuffle_map']
    notify_objs = app["root_notify_ids"]
    deleted_ids = app['deleted_ids']
    success = False

    if isValidDomain(obj_id):
        domain_bucket = getBucketForDomain(obj_id)
        if bucket and bucket != domain_bucket:
            log.error(
                f"expected bucket for domain: {obj_id} to match what wsas passed to write_s3_obj"
            )
        else:
            bucket = domain_bucket

    if s3key in pending_s3_write:
        msg = f"write_s3_key - not expected for key {s3key} to be in pending_s3_write map"
        log.error(msg)
        raise KeyError(msg)

    if obj_id not in pending_s3_write_tasks:
        # don't allow reentrant write
        log.debug(f"write_s3_obj for {obj_id} not s3sync task")

    if obj_id in deleted_ids and isValidUuid(obj_id):
        # if this obj_id has been deleted (and it's unique, since this is not a domain id)
        # cancel any pending task and return
        log.warn(f"Canceling write for {obj_id} since it has been deleted")
        if obj_id in pending_s3_write_tasks:
            log.info(f"removing pending s3 write task for {obj_id}")
            task = pending_s3_write_tasks[obj_id]
            task.cancel()
            del pending_s3_write_tasks[obj_id]
        return None

    now = time.time()

    last_update_time = now
    if obj_id in dirty_ids:
        # timestamp is the first element of the two-tuple
        last_update_time = dirty_ids[obj_id][0]
    if last_update_time > now:
        msg = f"last_update time {last_update_time} is in the future for obj_id: {obj_id}"
        log.error(msg)
        raise ValueError(msg)

    pending_s3_write[s3key] = now
    # do the following in the try block so we can always remove the pending_s3_write at the end

    try:
        if isValidChunkId(obj_id):
            if obj_id not in chunk_cache:
                log.error(f"expected to find obj_id: {obj_id} in chunk cache")
                raise KeyError(f"{obj_id} not found in chunk cache")
            if not chunk_cache.isDirty(obj_id):
                log.error(f"expected chunk cache obj {obj_id} to be dirty")
                raise ValueError("bad dirty state for obj")
            chunk_arr = chunk_cache[obj_id]
            chunk_bytes = arrayToBytes(chunk_arr)
            dset_id = getDatasetId(obj_id)
            deflate_level = None
            shuffle = 0
            if dset_id in deflate_map:
                deflate_level = deflate_map[dset_id]
                log.debug(
                    f"got deflate_level: {deflate_level} for dset: {dset_id}")
            if dset_id in shuffle_map:
                shuffle = shuffle_map[dset_id]
                log.debug(f"got shuffle size: {shuffle} for dset: {dset_id}")

            await putS3Bytes(app,
                             s3key,
                             chunk_bytes,
                             shuffle=shuffle,
                             deflate_level=deflate_level,
                             bucket=bucket)
            success = True

            # if chunk has been evicted from cache something has gone wrong
            if obj_id not in chunk_cache:
                msg = f"expected to find {obj_id} in chunk_cache"
                log.error(msg)
            elif obj_id in dirty_ids and dirty_ids[obj_id][0] > last_update_time:
                log.info(
                    f"write_s3_obj {obj_id} got updated while s3 write was in progress"
                )
            else:
                # no new write, can clear dirty
                chunk_cache.clearDirty(obj_id)  # allow eviction from cache
                log.debug(
                    f"putS3Bytes Chunk cache utilization: {chunk_cache.cacheUtilizationPercent} per, "
                    f"dirty_count: {chunk_cache.dirtyCount}")
        else:
            # meta data update
            # check for object in meta cache
            if obj_id not in meta_cache:
                log.error(f"expected to find obj_id: {obj_id} in meta cache")
                raise KeyError(f"{obj_id} not found in meta cache")
            if not meta_cache.isDirty(obj_id):
                log.error(f"expected meta cache obj {obj_id} to be dirty")
                raise ValueError("bad dirty state for obj")
            obj_json = meta_cache[obj_id]

            await putS3JSONObj(app, s3key, obj_json, bucket=bucket)
            success = True
            # should still be in meta_cache...
            if obj_id in deleted_ids:
                log.info(
                    f"obj {obj_id} has been deleted while write was in progress"
                )
            elif obj_id not in meta_cache:
                msg = f"expected to find {obj_id} in meta_cache"
                log.error(msg)
            elif obj_id in dirty_ids and dirty_ids[obj_id][0] > last_update_time:
                log.info(
                    f"write_s3_obj {obj_id} got updated while s3 write was in progress"
                )
            else:
                meta_cache.clearDirty(obj_id)  # allow eviction from cache
    finally:
        # clear pending_s3_write item
        log.debug(f"write_s3_obj finally block, success={success}")
        if s3key not in pending_s3_write:
            msg = f"write s3 obj: Expected to find {s3key} in pending_s3_write map"
            log.error(msg)
        else:
            if pending_s3_write[s3key] != now:
                msg = f"pending_s3_write timestamp got updated unexpectedly for {s3key}"
                log.error(msg)
            del pending_s3_write[s3key]
        # clear task
        if obj_id not in pending_s3_write_tasks:
            log.debug(f"no pending s3 write task for {obj_id}")
        else:
            log.debug(f"removing pending s3 write task for {obj_id}")
            del pending_s3_write_tasks[obj_id]
        # clear dirty flag
        if obj_id in dirty_ids and dirty_ids[obj_id][0] == last_update_time:
            log.debug(f"clearing dirty flag for {obj_id}")
            del dirty_ids[obj_id]

    # add to map so that root can be notified about changed objects
    if isValidUuid(obj_id) and isSchema2Id(obj_id):
        root_id = getRootObjId(obj_id)
        notify_objs[root_id] = bucket

    # calculate time to do the write
    elapsed_time = time.time() - now
    log.info(f"s3 write for {s3key} took {elapsed_time:.3f}s")
    return obj_id
Example 14
async def get_metadata_obj(app, obj_id, bucket=None):
    """ Get object from metadata cache (if present).
        Otherwise fetch from S3 and add to cache
    """
    log.info(f"get_metadata_obj: {obj_id} bucket: {bucket}")
    validateObjId(obj_id, bucket)  # throws internal server error if invalid
    if isValidDomain(obj_id):
        bucket = getBucketForDomain(obj_id)
    """
    try:
        validateInPartition(app, obj_id)
    except KeyError:
        log.error("Domain not in partition")
        raise HTTPInternalServerError() 
    """

    deleted_ids = app['deleted_ids']
    if obj_id in deleted_ids:
        msg = f"{obj_id} has been deleted"
        log.warn(msg)
        raise HTTPGone()

    meta_cache = app['meta_cache']
    obj_json = None
    if obj_id in meta_cache:
        log.debug(f"{obj_id} found in meta cache")
        obj_json = meta_cache[obj_id]
    else:
        s3_key = getS3Key(obj_id)
        pending_s3_read = app["pending_s3_read"]
        if obj_id in pending_s3_read:
            # already a read in progress, wait for it to complete
            read_start_time = pending_s3_read[obj_id]
            log.info(
                f"s3 read request for {s3_key} was requested at: {read_start_time}"
            )
            while time.time() - read_start_time < 2.0:
                log.debug("waiting for pending s3 read, sleeping")
                await asyncio.sleep(1)  # sleep for sub-second?
                if obj_id in meta_cache:
                    log.info(f"object {obj_id} has arrived!")
                    obj_json = meta_cache[obj_id]
                    break
            if not obj_json:
                log.warn(
                    f"s3 read for object {s3_key} timed-out, initiaiting a new read"
                )

        # invoke S3 read unless the object has just come in from pending read
        if not obj_json:
            log.debug(f"getS3JSONObj({s3_key}, bucket={bucket})")
            if obj_id not in pending_s3_read:
                pending_s3_read[obj_id] = time.time()
            # read S3 object as JSON
            obj_json = await getS3JSONObj(app, s3_key, bucket=bucket)
            if obj_id in pending_s3_read:
                # read complete - remove from pending map
                elapsed_time = time.time() - pending_s3_read[obj_id]
                log.info(f"s3 read for {s3_key} took {elapsed_time}")
                del pending_s3_read[obj_id]
            meta_cache[obj_id] = obj_json  # add to cache
    return obj_json
Example 15
async def delete_metadata_obj(app, obj_id, notify=True, root_id=None):
    """ Delete the given object """
    meta_cache = app['meta_cache']
    dirty_ids = app["dirty_ids"]
    log.info("delete_meta_data_obj: {} notify: {}".format(obj_id, notify))
    if not isValidDomain(obj_id) and not isValidUuid(obj_id):
        msg = "Invalid obj id: {}".format(obj_id)
        log.error(msg)
        raise HTTPInternalServerError()

    try:
        validateInPartition(app, obj_id)
    except KeyError:
        log.error(f"obj: {obj_id} not in partition")
        raise HTTPInternalServerError()

    deleted_ids = app['deleted_ids']
    if obj_id in deleted_ids:
        log.warn("{} has already been deleted".format(obj_id))
    else:
        deleted_ids.add(obj_id)

    if obj_id in meta_cache:
        log.debug(f"removing {obj_id} from meta_cache")
        del meta_cache[obj_id]

    if obj_id in dirty_ids:
        del dirty_ids[obj_id]

    # remove from S3 (if present)
    s3key = getS3Key(obj_id)

    if await isS3Obj(app, s3key):
        await deleteS3Obj(app, s3key)
    else:
        log.info(
            f"delete_metadata_obj - key {s3key} not found (never written)?")

    if notify:
        an_url = getAsyncNodeUrl(app)
        if obj_id.startswith("/"):
            # domain delete
            req = an_url + "/domain"
            params = {"domain": obj_id}

            try:
                log.info("ASync DELETE notify: {} params: {}".format(
                    req, params))
                await http_delete(app, req, params=params)
            except ClientError as ce:
                log.error(f"got error notifying async node: {ce}")
            except HTTPInternalServerError as hse:
                log.error(f"got HTTPInternalServerError: {hse}")
        else:
            req = an_url + "/object/" + obj_id
            try:
                log.info(f"ASync DELETE notify: {req}")
                await http_delete(app, req)
            except ClientError as ce:
                log.error(f"got ClientError notifying async node: {ce}")
            except HTTPInternalServerError as ise:
                log.error(
                    f"got HTTPInternalServerError notifying async node: {ise}")
    log.debug(f"delete_metadata_obj for {obj_id} done")
Example 16
async def getChunk(app, chunk_id, dset_json, s3path=None, s3offset=0, s3size=0, chunk_init=False):
    # if the chunk cache has too many dirty items, wait till items get flushed to S3
    MAX_WAIT_TIME = 10.0  # TBD - make this a config
    chunk_cache = app['chunk_cache']
    if chunk_init and s3offset > 0:
        log.error(f"unable to initiale chunk {chunk_id} for reference layouts ")
        raise  HTTPInternalServerError()

    log.debug(f"getChunk cache utilization: {chunk_cache.cacheUtilizationPercent} per, dirty_count: {chunk_cache.dirtyCount}, mem_dirty: {chunk_cache.memDirty}")

    chunk_arr = None 
    dset_id = getDatasetId(chunk_id)
    dims = getChunkLayout(dset_json)
    type_json = dset_json["type"]
    dt = createDataType(type_json)

    bucket = None
    s3key = None

    if s3path:
        if not s3path.startswith("s3://"):
            # TBD - verify these at dataset creation time?
            log.error(f"unexpected s3path for getChunk: {s3path}")
            raise  HTTPInternalServerError()
        path = s3path[5:]
        index = path.find('/')   # split bucket and key
        if index < 1:
            log.error(f"s3path is invalid: {s3path}")
            raise HTTPInternalServerError()
        bucket = path[:index]
        log.debug(f"using bucket: {bucket}")
        s3key = path[(index+1):]
        log.debug(f"Using bucket: {bucket} and  s3key: {s3key}")
    else:
        s3key = getS3Key(chunk_id)

    log.debug("getChunk s3key: {}".format(s3key))
    if chunk_id in chunk_cache:
        chunk_arr = chunk_cache[chunk_id]
    else:
        if s3path and s3size == 0:
            obj_exists = False
        else:
            obj_exists = await isS3Obj(app, s3key, bucket=bucket)
        # TBD - potential race condition?
        if obj_exists:
            pending_s3_read = app["pending_s3_read"]
            if chunk_id in pending_s3_read:
                # already a read in progress, wait for it to complete
                read_start_time = pending_s3_read[chunk_id]
                log.info(f"s3 read request for {chunk_id} was requested at: {read_start_time}")
                while time.time() - read_start_time < 2.0:
                    log.debug("waiting for pending s3 read, sleeping")
                    await asyncio.sleep(1)  # sleep for sub-second?
                    if chunk_id in chunk_cache:
                        log.info(f"Chunk {chunk_id} has arrived!")
                        chunk_arr = chunk_cache[chunk_id]
                        break
                if chunk_arr is None:
                    log.warn(f"s3 read for chunk {chunk_id} timed-out, initiaiting a new read")
            
            if chunk_arr is None:
                if chunk_id not in pending_s3_read:
                    pending_s3_read[chunk_id] = time.time()
                log.debug("Reading chunk {} from S3".format(s3key))
                deflate_level = getDeflate(app, dset_id, dset_json)
                chunk_bytes = await getS3Bytes(app, s3key, deflate_level=deflate_level, s3offset=s3offset, s3size=s3size, bucket=bucket)
                if chunk_id in pending_s3_read:
                    # read complete - remove from pending map
                    elapsed_time = time.time() - pending_s3_read[chunk_id]
                    log.info(f"s3 read for {s3key} took {elapsed_time}")
                    del pending_s3_read[chunk_id]
                else:
                    log.warn(f"expected to find {chunk_id} in pending_s3_read map")

                # only convert the bytes to a numpy array if they were just read;
                # otherwise chunk_arr already came from the cache via the pending read
                chunk_arr = np.fromstring(chunk_bytes, dtype=dt)
                chunk_arr = chunk_arr.reshape(dims)

            log.debug("chunk size: {}".format(chunk_arr.size))
           
        elif chunk_init:
            log.debug("Initializing chunk {chunk_id}")
            fill_value = getFillValue(dset_json)
            if fill_value:
                # need to convert list to tuples for numpy broadcast
                if isinstance(fill_value, list):
                    fill_value = tuple(fill_value)
                chunk_arr = np.empty(dims, dtype=dt, order='C')
                chunk_arr[...] = fill_value
            else:
                chunk_arr = np.zeros(dims, dtype=dt, order='C')
        else:
            log.debug(f"Chunk {chunk_id} not found")
            
        if chunk_arr is not None:
            # check that there's room in the cache before adding it
            if chunk_cache.memTarget - chunk_cache.memDirty < chunk_arr.size:
                # no room in the cache, wait till space is freed by the s3sync task
                wait_start = time.time()
                while chunk_cache.memTarget - chunk_cache.memDirty < chunk_arr.size:
                    log.warn(f"getChunk, cache utilization: {chunk_cache.cacheUtilizationPercent}, sleeping till items are flushed")
                    if time.time() - wait_start > MAX_WAIT_TIME:
                        log.error(f"unable to save updated chunk {chunk_id} to cache returning 503 error")
                        raise HTTPServiceUnavailable()
                    await asyncio.sleep(1)
                 
            chunk_cache[chunk_id] = chunk_arr  # store in cache
    return chunk_arr
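The chunk_init branch above converts list fill values to tuples before broadcasting. The short, self-contained numpy sketch below shows why that conversion matters for compound dtypes; the dtype and values are made up for illustration and are not taken from the service.

import numpy as np

# Illustration only: a compound dtype similar to what createDataType might produce.
dt = np.dtype([("a", "i4"), ("b", "f8")])
dims = (2, 3)

fill_value = [42, 3.14]          # fill values arrive from JSON as a list
fill_value = tuple(fill_value)   # a tuple broadcasts as a single compound element

chunk_arr = np.empty(dims, dtype=dt, order="C")
chunk_arr[...] = fill_value      # a plain list would be treated as a length-2 array
                                 # and fail to broadcast into shape (2, 3)
print(chunk_arr[0, 0])           # (42, 3.14)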
Example 17
async def s3sync(app):
    """ Periodic method that writes dirty objects in the metadata cache to S3"""
    MAX_PENDING_WRITE_REQUESTS = 20
    dirty_ids = app["dirty_ids"]
    pending_s3_write = app["pending_s3_write"]
    pending_s3_write_tasks = app["pending_s3_write_tasks"]
    s3_sync_interval = config.get("s3_sync_interval")
    dirty_count = len(dirty_ids)
    if not dirty_count:
        log.info("s3sync nothing to update")
        return 0

    log.info(
        f"s3sync update - dirtyid count: {dirty_count}, active write tasks: {len(pending_s3_write_tasks)}/{MAX_PENDING_WRITE_REQUESTS}"
    )
    log.debug(f"s3sync dirty_ids: {dirty_ids}")
    log.debug(f"sesync pending write s3keys: {list(pending_s3_write.keys())}")
    log.debug(f"s3sync write tasks: {list(pending_s3_write_tasks.keys())}")

    def callback(future):
        try:
            obj_id = future.result()  # returns an obj_id
            log.info(f"write_s3_obj callback result: {obj_id}")
        except HTTPInternalServerError as hse:
            log.error(f"write_s3_obj callback got 500: {hse}")
        except Exception as e:
            log.error(f"write_s3_obj callback unexpected exception: {e}")

    update_count = 0
    s3sync_start = time.time()

    for obj_id in dirty_ids:
        s3key = getS3Key(obj_id)
        log.debug(f"s3sync dirty id: {obj_id}, s3key: {s3key}")
        create_task = True
        if s3key in pending_s3_write:
            log.debug(
                f"key {s3key} has been pending for {s3sync_start - pending_s3_write[s3key]}"
            )
            if s3sync_start - pending_s3_write[s3key] > s3_sync_interval * 2:
                log.warn(
                    f"obj {obj_id} has been in pending_s3_write for {s3sync_start - pending_s3_write[s3key]} seconds, restarting"
                )
                del pending_s3_write[s3key]
                if obj_id not in pending_s3_write_tasks:
                    log.error(f"Expected to find write task for {obj_id}")
                else:
                    task = pending_s3_write_tasks[obj_id]
                    task.cancel()
                    del pending_s3_write_tasks[obj_id]
            else:
                log.debug(f"key {s3key} has a pending write task")
                create_task = False
                if obj_id not in pending_s3_write_tasks:
                    log.error(
                        f"expected to find {obj_id} in pending_s3_write_tasks")
        if create_task and len(
                pending_s3_write_tasks) < MAX_PENDING_WRITE_REQUESTS:
            # create a task to write this object
            log.debug(f"s3sync - ensure future for {obj_id}")
            task = asyncio.ensure_future(write_s3_obj(app, obj_id))
            task.add_done_callback(callback)
            pending_s3_write_tasks[obj_id] = task
            update_count += 1

    # notify AN of key updates
    an_url = getAsyncNodeUrl(app)

    notify_objs = app["an_notify_objs"]
    if len(notify_objs) > 0:
        log.info(f"Notifying AN for {len(notify_objs)} S3 Updates")
        body = {"objs": list(notify_objs)}
        notify_objs.clear()

        req = an_url + "/objects"
        try:
            log.info("ASync PUT notify: {} body: {}".format(req, body))
            await http_put(app, req, data=body)
        except HTTPInternalServerError as hpe:
            msg = "got error notifying async node: {}".format(hpe)
            log.error(msg)

    # return number of objects written
    return update_count