def main():
    if len(sys.argv) == 1 or sys.argv[1] == "-h" or sys.argv[1] == "--help":
        printUsage()
        sys.exit(1)

    chunk_id = sys.argv[-1]
    if not isValidChunkId(chunk_id):
        print("Invalid chunk id")
        sys.exit(1)

    # we need to set up an asyncio loop to query S3
    loop = asyncio.get_event_loop()
    session = get_session(loop=loop)

    app = {}
    app["session"] = session
    app["bucket_name"] = config.get("bucket_name")
    app["node_count"] = 1
    app["node_number"] = 0
    app["deleted_ids"] = set()
    app["pending_s3_read"] = {}
    app["meta_cache"] = LruCache(mem_target=1024 * 1024, chunk_cache=False)
    app["chunk_cache"] = LruCache(mem_target=64 * 1024 * 1024, chunk_cache=True)

    domain = config.get("domain")
    if not domain:
        printUsage()
        sys.exit(-1)
    print("got domain:", domain)

    loop.run_until_complete(printChunkValues(app, domain, chunk_id))
    loop.close()
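# main() relies on a printUsage() helper that isn't shown here. A minimal
# sketch consistent with how main() parses its arguments; the script name and
# wording are assumptions:
def printUsage():
    print("usage: python print_chunk_values.py [-h|--help] <chunk_id>")
    print("  (bucket_name and domain are read from the config)")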
async def bucketCheck(app):
    """ Verify that contents of bucket are self-consistent """
    now = int(time.time())
    log.info("bucket check {}".format(unixTimeToUTC(now)))

    # do initial listKeys
    await listKeys(app)

    # clear used flags
    clearUsedFlags(app)

    # mark objs
    await markObjs(app)

    unlinked_count = 0
    s3objs = app["s3objs"]
    for objid in s3objs:
        if isValidUuid(objid) and not isValidChunkId(objid):
            try:
                s3obj = await getS3Obj(app, objid)
                if s3obj.used is False:
                    unlinked_count += 1
            except HTTPInternalServerError as hpe:
                log.warn("got error retrieving {}: {}".format(objid, hpe.code))

    domains = app["domains"]
    for domain in domains:
        print("domain:", domain)

    roots = app["roots"]
    for root in roots:
        print("root:", root)

    top_level_domains = []
    for domain in domains:
        if domain[0] != '/':
            log.error("unexpected domain: {}".format(domain))
            continue
        if domain[1:].find('/') == -1:
            top_level_domains.append(domain)

    print("top-level-domains:")
    for domain in top_level_domains:
        print(domain)
    print("=" * 80)

    print("total storage: {}".format(app["bytes_in_bucket"]))
    print("Num objects: {}".format(len(app["s3objs"])))
    print("Num domains: {}".format(len(app["domains"])))
    print("Num root groups: {}".format(len(app["roots"])))
    print("Unlinked objects: {}".format(unlinked_count))
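# bucketCheck assumes a prior listKeys/markObjs pass has populated several app
# fields. A bootstrap sketch naming the fields the function reads; the
# container types are assumptions (the code above only iterates and len()s
# them), and the function name is hypothetical:
def initBucketCheckApp():
    app = {}
    app["s3objs"] = {}          # objid -> s3 object info (getS3Obj exposes .used)
    app["domains"] = {}         # domain path -> domain info
    app["roots"] = {}           # root group id -> root info
    app["bytes_in_bucket"] = 0  # total storage used
    return app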
async def write_s3_obj(app, obj_id, bucket=None):
    """ Write the given object to S3 """
    s3key = getS3Key(obj_id)
    log.info(f"write_s3_obj for obj_id: {obj_id} / s3_key: {s3key} bucket: {bucket}")
    pending_s3_write = app["pending_s3_write"]
    pending_s3_write_tasks = app["pending_s3_write_tasks"]
    dirty_ids = app["dirty_ids"]
    chunk_cache = app["chunk_cache"]
    meta_cache = app["meta_cache"]
    deflate_map = app["deflate_map"]
    shuffle_map = app["shuffle_map"]
    notify_objs = app["root_notify_ids"]
    deleted_ids = app["deleted_ids"]
    success = False

    if isValidDomain(obj_id):
        domain_bucket = getBucketForDomain(obj_id)
        if bucket and bucket != domain_bucket:
            log.error(f"expected bucket for domain: {obj_id} to match what was passed to write_s3_obj")
        else:
            bucket = domain_bucket

    if s3key in pending_s3_write:
        msg = f"write_s3_key - not expected for key {s3key} to be in pending_s3_write map"
        log.error(msg)
        raise KeyError(msg)

    if obj_id not in pending_s3_write_tasks:
        # don't allow reentrant write
        log.debug(f"write_s3_obj for {obj_id} - no pending s3sync task")

    if obj_id in deleted_ids and isValidUuid(obj_id):
        # if this objid has been deleted (and it's unique, since this is not a
        # domain id), cancel any pending task and return
        log.warn(f"Canceling write for {obj_id} since it has been deleted")
        if obj_id in pending_s3_write_tasks:
            log.info(f"removing pending s3 write task for {obj_id}")
            task = pending_s3_write_tasks[obj_id]
            task.cancel()
            del pending_s3_write_tasks[obj_id]
        return None

    now = time.time()
    last_update_time = now
    if obj_id in dirty_ids:
        # timestamp is the first element of the two-tuple
        last_update_time = dirty_ids[obj_id][0]
    if last_update_time > now:
        msg = f"last_update time {last_update_time} is in the future for obj_id: {obj_id}"
        log.error(msg)
        raise ValueError(msg)

    pending_s3_write[s3key] = now
    # do the following in the try block so we can always remove the
    # pending_s3_write entry at the end
    try:
        if isValidChunkId(obj_id):
            if obj_id not in chunk_cache:
                log.error(f"expected to find obj_id: {obj_id} in chunk cache")
                raise KeyError(f"{obj_id} not found in chunk cache")
            if not chunk_cache.isDirty(obj_id):
                log.error(f"expected chunk cache obj {obj_id} to be dirty")
                raise ValueError("bad dirty state for obj")
            chunk_arr = chunk_cache[obj_id]
            chunk_bytes = arrayToBytes(chunk_arr)
            dset_id = getDatasetId(obj_id)
            deflate_level = None
            shuffle = 0
            if dset_id in deflate_map:
                deflate_level = deflate_map[dset_id]
                log.debug(f"got deflate_level: {deflate_level} for dset: {dset_id}")
            if dset_id in shuffle_map:
                shuffle = shuffle_map[dset_id]
                log.debug(f"got shuffle size: {shuffle} for dset: {dset_id}")

            await putS3Bytes(app, s3key, chunk_bytes, shuffle=shuffle,
                             deflate_level=deflate_level, bucket=bucket)
            success = True

            # if the chunk has been evicted from cache, something has gone wrong
            if obj_id not in chunk_cache:
                msg = f"expected to find {obj_id} in chunk_cache"
                log.error(msg)
            elif obj_id in dirty_ids and dirty_ids[obj_id][0] > last_update_time:
                log.info(f"write_s3_obj {obj_id} got updated while s3 write was in progress")
            else:
                # no new write, can clear dirty
                chunk_cache.clearDirty(obj_id)  # allow eviction from cache
                log.debug("putS3Bytes Chunk cache utilization: {} per, dirty_count: {}".format(
                    chunk_cache.cacheUtilizationPercent, chunk_cache.dirtyCount))
        else:
            # metadata update - check for object in meta cache
            if obj_id not in meta_cache:
                log.error(f"expected to find obj_id: {obj_id} in meta cache")
                raise KeyError(f"{obj_id} not found in meta cache")
            if not meta_cache.isDirty(obj_id):
                log.error(f"expected meta cache obj {obj_id} to be dirty")
                raise ValueError("bad dirty state for obj")
            obj_json = meta_cache[obj_id]

            await putS3JSONObj(app, s3key, obj_json, bucket=bucket)
            success = True

            # should still be in meta_cache...
            if obj_id in deleted_ids:
                log.info(f"obj {obj_id} has been deleted while write was in progress")
            elif obj_id not in meta_cache:
                msg = f"expected to find {obj_id} in meta_cache"
                log.error(msg)
            elif obj_id in dirty_ids and dirty_ids[obj_id][0] > last_update_time:
                log.info(f"write_s3_obj {obj_id} got updated while s3 write was in progress")
            else:
                meta_cache.clearDirty(obj_id)  # allow eviction from cache
    finally:
        # clear pending_s3_write item
        log.debug(f"write_s3_obj finally block, success={success}")
        if s3key not in pending_s3_write:
            msg = f"write s3 obj: Expected to find {s3key} in pending_s3_write map"
            log.error(msg)
        else:
            if pending_s3_write[s3key] != now:
                msg = f"pending_s3_write timestamp got updated unexpectedly for {s3key}"
                log.error(msg)
            del pending_s3_write[s3key]
        # clear task
        if obj_id not in pending_s3_write_tasks:
            log.debug(f"no pending s3 write task for {obj_id}")
        else:
            log.debug(f"removing pending s3 write task for {obj_id}")
            del pending_s3_write_tasks[obj_id]
        # clear dirty flag
        if obj_id in dirty_ids and dirty_ids[obj_id][0] == last_update_time:
            log.debug(f"clearing dirty flag for {obj_id}")
            del dirty_ids[obj_id]

    # add to map so that root can be notified about changed objects
    if isValidUuid(obj_id) and isSchema2Id(obj_id):
        root_id = getRootObjId(obj_id)
        notify_objs[root_id] = bucket

    # calculate time to do the write
    elapsed_time = time.time() - now
    log.info(f"s3 write for {s3key} took {elapsed_time:.3f}s")
    return obj_id
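# Callers track in-flight writes in pending_s3_write_tasks so a later delete
# can cancel them (see task.cancel() in write_s3_obj above). A minimal sketch
# of how an s3sync pass might schedule a write; the helper name is an
# assumption, not part of the codebase:
def schedule_s3_write(app, obj_id, bucket=None):
    pending_s3_write_tasks = app["pending_s3_write_tasks"]
    if obj_id in pending_s3_write_tasks:
        return  # a write for this object is already in flight
    task = asyncio.ensure_future(write_s3_obj(app, obj_id, bucket=bucket))
    pending_s3_write_tasks[obj_id] = task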
async def save_metadata_obj(app, obj_id, obj_json, bucket=None, notify=False, flush=False):
    """ Persist the given object """
    log.info(f"save_metadata_obj {obj_id} bucket={bucket} notify={notify} flush={flush}")
    if notify and not flush:
        log.error("notify not valid when flush is false")
        raise HTTPInternalServerError()

    validateObjId(obj_id, bucket)
    if not isinstance(obj_json, dict):
        log.error("Passed non-dict obj to save_metadata_obj")
        raise HTTPInternalServerError()

    try:
        validateInPartition(app, obj_id)
    except KeyError:
        log.error("Domain not in partition")
        raise HTTPInternalServerError()

    dirty_ids = app["dirty_ids"]
    deleted_ids = app["deleted_ids"]
    if obj_id in deleted_ids:
        if isValidUuid(obj_id):
            # domain objects may be re-created, but shouldn't see repeats of
            # deleted uuids
            log.warn(f"{obj_id} has been deleted")
            raise HTTPInternalServerError()
        else:
            deleted_ids.remove(obj_id)  # un-gone the domain id

    # update meta cache
    meta_cache = app["meta_cache"]
    log.debug(f"save: {obj_id} to cache")
    meta_cache[obj_id] = obj_json
    meta_cache.setDirty(obj_id)
    now = int(time.time())

    if flush:
        # write to S3 immediately
        if isValidChunkId(obj_id):
            log.warn("flush not supported for save_metadata_obj with chunks")
            raise HTTPBadRequest()
        try:
            await write_s3_obj(app, obj_id, bucket=bucket)
        except KeyError as ke:
            log.error(f"s3 sync got key error: {ke}")
            raise HTTPInternalServerError()
        except HTTPInternalServerError:
            log.warn(f"failed to write {obj_id}")
            raise  # re-throw
        if obj_id in dirty_ids:
            log.warn(f"save_metadata_obj flush - object {obj_id} is still dirty")
        # message AN immediately if notify flag is set
        # otherwise AN will be notified at next S3 sync
        if notify:
            if isValidUuid(obj_id) and isSchema2Id(obj_id):
                root_id = getRootObjId(obj_id)
                await notify_root(app, root_id, bucket=bucket)
    else:
        log.debug(f"setting dirty_ids[{obj_id}] = ({now}, {bucket})")
        if isValidUuid(obj_id) and not bucket:
            log.warn(f"bucket is not defined for save_metadata_obj: {obj_id}")
        dirty_ids[obj_id] = (now, bucket)
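# When flush is False the object is only marked dirty; a periodic s3sync pass
# is expected to drain dirty_ids later. A sketch of that consumer, assuming
# the (timestamp, bucket) two-tuple format set above; a real sync loop would
# also age-filter entries and run writes concurrently:
async def s3syncPass(app):
    dirty_ids = app["dirty_ids"]
    for obj_id in list(dirty_ids):  # copy keys; write_s3_obj mutates the map
        _, bucket = dirty_ids[obj_id]
        await write_s3_obj(app, obj_id, bucket=bucket)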
def scanRootCallback(app, s3keys):
    log.debug(f"scanRootCallback, {len(s3keys)} items")
    if isinstance(s3keys, list):
        log.error("got list result for s3keys callback")
        raise ValueError("unexpected callback format")

    results = app["scanRoot_results"]
    if results:
        log.debug(f"previous scanRoot_results: {results}")

    for s3key in s3keys.keys():
        if not isS3ObjKey(s3key):
            log.info(f"not s3obj key, ignoring: {s3key}")
            continue
        objid = getObjId(s3key)
        etag = None
        obj_size = None
        lastModified = None
        item = s3keys[s3key]
        if "ETag" in item:
            etag = item["ETag"]
        if "Size" in item:
            obj_size = item["Size"]
        if "LastModified" in item:
            lastModified = item["LastModified"]
        log.debug(f"{objid}: {etag} {obj_size} {lastModified}")

        if lastModified > results["lastModified"]:
            log.debug(f"changing lastModified from: {results['lastModified']} to {lastModified}")
            results["lastModified"] = lastModified

        is_chunk = False
        if isValidChunkId(objid):
            is_chunk = True
            results["num_chunks"] += 1
            results["allocated_bytes"] += obj_size
        else:
            results["metadata_bytes"] += obj_size

        if is_chunk or getCollectionForId(objid) == "datasets":
            if is_chunk:
                dsetid = getDatasetId(objid)
            else:
                dsetid = objid
            datasets = results["datasets"]
            if dsetid not in datasets:
                dataset_info = {}
                dataset_info["lastModified"] = 0
                dataset_info["num_chunks"] = 0
                dataset_info["allocated_bytes"] = 0
                datasets[dsetid] = dataset_info
            dataset_info = datasets[dsetid]
            if lastModified > dataset_info["lastModified"]:
                dataset_info["lastModified"] = lastModified
            if is_chunk:
                dataset_info["num_chunks"] += 1
                dataset_info["allocated_bytes"] += obj_size
        elif getCollectionForId(objid) == "groups":
            results["num_groups"] += 1
        elif getCollectionForId(objid) == "datatypes":
            results["num_datatypes"] += 1
        else:
            log.error(f"Unexpected collection type for id: {objid}")
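# scanRootCallback assumes app["scanRoot_results"] was seeded before the scan
# starts. A sketch of that initialization, using exactly the keys the callback
# updates (lastModified is an epoch timestamp, matching the 0 default used for
# dataset_info above); the function name is an assumption:
def initScanRootResults(app):
    results = {}
    results["lastModified"] = 0
    results["num_groups"] = 0
    results["num_datatypes"] = 0
    results["num_chunks"] = 0
    results["allocated_bytes"] = 0
    results["metadata_bytes"] = 0
    results["datasets"] = {}  # dataset id -> per-dataset stats
    app["scanRoot_results"] = results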
# Variant of save_metadata_obj that notifies the AsyncNode directly over HTTP
# rather than via notify_root, and that does not take a bucket parameter.
async def save_metadata_obj(app, obj_id, obj_json, notify=False, flush=False):
    """ Persist the given object """
    log.info(f"save_metadata_obj {obj_id} notify={notify} flush={flush}")
    if notify and not flush:
        log.error("notify not valid when flush is false")
        raise HTTPInternalServerError()

    if not obj_id.startswith('/') and not isValidUuid(obj_id):
        msg = f"Invalid obj id: {obj_id}"
        log.error(msg)
        raise HTTPInternalServerError()
    if not isinstance(obj_json, dict):
        log.error("Passed non-dict obj to save_metadata_obj")
        raise HTTPInternalServerError()

    try:
        validateInPartition(app, obj_id)
    except KeyError:
        log.error("Domain not in partition")
        raise HTTPInternalServerError()

    dirty_ids = app["dirty_ids"]
    deleted_ids = app["deleted_ids"]
    if obj_id in deleted_ids:
        if isValidUuid(obj_id):
            # domain objects may be re-created, but shouldn't see repeats of
            # deleted uuids
            log.warn(f"{obj_id} has been deleted")
            raise HTTPInternalServerError()
        else:
            deleted_ids.remove(obj_id)  # un-gone the domain id

    # update meta cache
    meta_cache = app["meta_cache"]
    log.debug(f"save: {obj_id} to cache")
    meta_cache[obj_id] = obj_json
    meta_cache.setDirty(obj_id)

    now = int(time.time())
    if flush:
        # write to S3 immediately
        if isValidChunkId(obj_id):
            log.warn("flush not supported for save_metadata_obj with chunks")
            raise HTTPBadRequest()
        try:
            await write_s3_obj(app, obj_id)
        except KeyError as ke:
            log.error(f"s3 sync got key error: {ke}")
            raise HTTPInternalServerError()
        except HTTPInternalServerError:
            log.warn(f"failed to write {obj_id}")
            raise  # re-throw
        if obj_id in dirty_ids:
            log.warn(f"save_metadata_obj flush - object {obj_id} is still dirty")
    else:
        # flag to write to S3
        dirty_ids[obj_id] = now

    # message AN immediately if notify flag is set
    # otherwise AN will be notified at next S3 sync
    if notify:
        an_url = getAsyncNodeUrl(app)
        if obj_id.startswith("/"):
            # domain update
            req = an_url + "/domain"
            params = {"domain": obj_id}
            if "root" in obj_json:
                params["root"] = obj_json["root"]
            if "owner" in obj_json:
                params["owner"] = obj_json["owner"]
            try:
                log.info(f"Async PUT notify: {req} params: {params}")
                await http_put(app, req, params=params)
            except HTTPInternalServerError as hpe:
                log.error(f"got error notifying async node: {hpe}")
        else:
            req = an_url + "/object/" + obj_id
            try:
                log.info(f"Async PUT notify: {req}")
                await http_put(app, req)
            except HTTPInternalServerError as hpe:
                log.error(f"got error notifying async node: {hpe}")
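# Hypothetical usage of this variant: persisting a domain object and notifying
# the AN in one call. The domain path, root id, and owner are illustrative
# placeholders:
async def example_save(app):
    domain_json = {"root": "g-12345678-1234-1234-1234-1234567890ab",
                   "owner": "admin"}
    await save_metadata_obj(app, "/home/admin/example", domain_json,
                            notify=True, flush=True)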