async def getStorBytes(app, key, shuffle=0, deflate_level=None, offset=0, length=None, bucket=None):
    """ Get object identified by key and read as bytes
    """
    client = _getStorageClient(app)
    if not bucket:
        bucket = app['bucket_name']
    if key[0] == '/':
        key = key[1:]  # no leading slash
    log.info(f"getStorBytes({bucket}/{key})")

    data = await client.get_object(bucket=bucket, key=key, offset=offset, length=length)

    if data and len(data) > 0:
        log.info(f"read: {len(data)} bytes for key: {key}")

        if deflate_level is not None:
            try:
                unzip_data = zlib.decompress(data)
                log.info(f"uncompressed to {len(unzip_data)} bytes")
                data = unzip_data
            except zlib.error as zlib_error:
                log.info(f"zlib_err: {zlib_error}")
                log.warn(f"unable to uncompress obj: {key}")
        if shuffle > 0:
            unshuffled = _unshuffle(shuffle, data)
            log.info(f"unshuffled to {len(unshuffled)} bytes")
            data = unshuffled

    return data
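
# Hedged usage sketch (not part of the original source): read a byte range of an
# uncompressed object with the helper above. The key and offsets are made-up values,
# and "app" is assumed to already hold an initialized storage client and a
# 'bucket_name' entry.
async def _read_range_example(app):
    # fetch 1024 bytes starting at offset 4096 of a hypothetical chunk object
    chunk_bytes = await getStorBytes(app, "db/example-root/chunk-key", offset=4096, length=1024)
    return chunk_bytes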
async def getObjectIdByPath(app, obj_id, h5path, bucket=None, refresh=False):
    """ Find the object at the provided h5path location.
    If not found raise 404 error.
    """
    log.info(f"getObjectIdByPath obj_id: {obj_id} h5path: {h5path} refresh: {refresh}")
    if h5path.startswith("./"):
        h5path = h5path[2:]  # treat as relative path
    links = h5path.split('/')
    for link in links:
        if not link:
            continue  # skip empty link
        log.debug(f"getObjectIdByPath for objid: {obj_id} got link: {link}")
        if getCollectionForId(obj_id) != "groups":
            # not a group, so won't have links
            msg = f"h5path: {h5path} not found"
            log.warn(msg)
            raise HTTPNotFound()
        req = getDataNodeUrl(app, obj_id)
        req += "/groups/" + obj_id + "/links/" + link
        log.debug("get LINK: " + req)
        params = {}
        if bucket:
            params["bucket"] = bucket
        link_json = await http_get(app, req, params=params)
        log.debug("got link_json: " + str(link_json))
        if link_json["class"] != 'H5L_TYPE_HARD':
            # don't follow soft/external links
            msg = f"h5path: {h5path} not found"
            log.warn(msg)
            raise HTTPInternalServerError()
        obj_id = link_json["id"]
    # if we get here, we've traversed the entire path and found the object
    return obj_id
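
# Hedged usage sketch (not part of the original source): resolve an h5path relative
# to a root group with the function above. The root id and path below are hypothetical;
# an HTTPNotFound propagates if any path component is missing.
async def _resolve_path_example(app, root_id, bucket=None):
    obj_id = await getObjectIdByPath(app, root_id, "g1/g1.1/dset1", bucket=bucket)
    log.info(f"resolved h5path to obj_id: {obj_id}")
    return obj_id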
async def GET_Attribute(request):
    """HTTP GET method to return JSON for /(obj)/<id>/attributes/<name>
    """
    log.request(request)
    app = request.app
    params = request.rel_url.query
    obj_id = get_obj_id(request)

    attr_name = request.match_info.get('name')
    validateAttributeName(attr_name)
    if "bucket" in params:
        bucket = params["bucket"]
    else:
        bucket = None

    obj_json = await get_metadata_obj(app, obj_id, bucket=bucket)
    log.info(f"GET attribute obj_id: {obj_id} name: {attr_name} bucket: {bucket}")
    log.debug(f"got obj_json: {obj_json}")

    if "attributes" not in obj_json:
        log.error(f"unexpected obj data for id: {obj_id}")
        raise HTTPInternalServerError()

    attributes = obj_json["attributes"]
    if attr_name not in attributes:
        msg = f"Attribute '{attr_name}' not found for id: {obj_id}"
        log.warn(msg)
        raise HTTPNotFound()

    attr_json = attributes[attr_name]

    resp = json_response(attr_json)
    log.response(request, resp=resp)
    return resp
async def nodestate(request):
    """HTTP method to return information about registered nodes"""
    log.request(request)
    node_type = request.match_info.get('nodetype', '*')
    node_number = '*'
    if node_type != '*':
        node_number = request.match_info.get('nodenumber', '*')

    log.info("nodestate/{}/{}".format(node_type, node_number))
    if node_type not in ("sn", "dn", "*"):
        msg = "invalid node_type"
        log.response(request, code=400, message=msg)
        raise HTTPBadRequest(reason=msg)

    app = request.app

    if node_number == '*':
        nodes = []
        for node in app["nodes"]:
            if node["node_type"] == node_type or node_type == "*":
                nodes.append(node)
        answer = {"nodes": nodes}
    else:
        answer = {}
        for node in app["nodes"]:
            if node["node_type"] == node_type and str(node["node_number"]) == node_number:
                answer = node
                break
    answer["cluster_state"] = app["cluster_state"]

    resp = json_response(answer)
    log.response(request, resp=resp)
    return resp
def callback(future):
    try:
        obj_id = future.result()  # returns an objid
        log.info(f"write_s3_obj callback result: {obj_id}")
    except HTTPInternalServerError as hse:
        log.error(f"write_s3_obj callback got 500: {hse}")
    except Exception as e:
        log.error(f"write_s3_obj callback unexpected exception: {e}")
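
# Hedged usage sketch (not part of the original source): how a done-callback like the
# one above is typically attached to a fire-and-forget write task. The write_s3_obj
# name comes from the log messages; its signature here is an assumption for illustration.
def _schedule_write_example(app, obj_id, bucket=None):
    task = asyncio.ensure_future(write_s3_obj(app, obj_id, bucket=bucket))
    # future.result() inside callback() re-raises any exception from the task,
    # which is why HTTPInternalServerError is caught there
    task.add_done_callback(callback)
    return task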
async def releaseClient(self):
    """ release the client collection to s3
    (Used for cleanup on application exit)
    """
    log.info("release S3Client")
    if 's3' in self._app:
        client = self._app['s3']
        await client.close()
        del self._app['s3']
async def PUT_Domain(request):
    """HTTP PUT method to create a domain
    """
    log.request(request)
    app = request.app

    if not request.has_body:
        msg = "Expected body in put domain"
        log.error(msg)
        raise HTTPInternalServerError()
    body = await request.json()
    log.debug(f"got body: {body}")
    domain = get_domain(request, body=body)

    log.debug(f"PUT domain: {domain}")
    bucket = getBucketForDomain(domain)
    if not bucket:
        log.error(f"expected bucket to be used in domain: {domain}")
        raise HTTPInternalServerError()

    body_json = body  # request body was already read above
    if "owner" not in body_json:
        msg = "Expected Owner Key in Body"
        log.warn(msg)
        raise HTTPInternalServerError()
    if "acls" not in body_json:
        msg = "Expected acls Key in Body"
        log.warn(msg)
        raise HTTPInternalServerError()

    # check if the domain already exists
    domain_exists = await check_metadata_obj(app, domain)
    if domain_exists:
        # domain already exists
        msg = "Conflict: resource exists: " + domain
        log.info(msg)
        raise HTTPConflict()

    domain_json = {}
    if "root" in body_json:
        domain_json["root"] = body_json["root"]
    else:
        log.info("no root id, creating folder")
    domain_json["owner"] = body_json["owner"]
    domain_json["acls"] = body_json["acls"]
    now = time.time()
    domain_json["created"] = now
    domain_json["lastModified"] = now

    # write the domain json to S3 immediately so it will show up in a get_domains S3 scan
    await save_metadata_obj(app, domain, domain_json, notify=True, flush=True)

    resp = json_response(domain_json, status=201)
    log.response(request, resp=resp)
    return resp
def __init__(self, app, root_id, include_attrs=True, max_tasks=40):
    log.info(f"DomainCrawler.__init__ root_id: {root_id}")
    self._app = app
    self._include_attrs = include_attrs
    self._max_tasks = max_tasks
    self._q = asyncio.Queue()
    self._obj_dict = {}
    self.seen_ids = set()
    self._q.put_nowait(root_id)
async def releaseClient(self):
    """ release the client collection to Azure Blob Storage
    (Used for cleanup on application exit)
    """
    log.info("release AzureBlobClient")
    if 'azureBlobClient' in self._app:
        client = self._app['azureBlobClient']
        await client.close()
        del self._app['azureBlobClient']
async def list_keys(self, prefix='', deliminator='', suffix='', include_stats=False,
                    callback=None, bucket=None, limit=None):
    """ return keys matching the arguments
    """
    if not bucket:
        log.error("list_keys - bucket not set")
        raise HTTPInternalServerError()
    log.info(f"list_keys('{prefix}','{deliminator}','{suffix}', include_stats={include_stats})")
    buckets = self._client
    if bucket not in buckets:
        return []
    bucket_map = buckets[bucket]
    key_set = set()
    for key in bucket_map:
        if prefix and not key.startswith(prefix):
            continue  # skip any keys without the prefix
        if deliminator:
            index = key[len(prefix):].find(deliminator)
            if index > 0:
                num_chars = index + len(prefix)
                key = key[:num_chars]
        key_set.add(key)

    key_list = list(key_set)
    key_list.sort()
    if limit and len(key_list) > limit:
        key_list = key_list[:limit]
    if include_stats:
        now = time.time()
        # add ETag, modified time, and size to each item
        items = {}
        for key in key_list:
            item = {"ETag": "ABCD", "LastModified": now}
            if key in bucket_map:
                obj_size = len(bucket_map[key])
            else:
                obj_size = 0
            item["Size"] = obj_size
            items[key] = item
    else:
        # just return the list
        items = key_list

    log.info(f"list_keys done, got {len(items)} keys")

    return items
def __init__(self, app, domains, bucket=None, verbose=False, max_tasks=40):
    log.info("FolderCrawler.__init__")
    self._app = app
    self._verbose = verbose
    self._max_tasks = max_tasks
    self._q = asyncio.Queue()
    self._domain_dict = {}
    for domain in domains:
        self._q.put_nowait(domain)
    self._bucket = bucket
def getAcceptType(request):
    accept_type = "json"  # default to JSON
    if "accept" in request.headers:
        # treat everything as json unless octet-stream is given
        if request.headers["accept"] != "application/octet-stream":
            msg = "Ignoring accept value: {}".format(request.headers["accept"])
            log.info(msg)
        else:
            accept_type = "binary"
    return accept_type
async def crawl(self):
    workers = [asyncio.Task(self.work()) for _ in range(self._max_tasks)]
    # When all work is done, exit.
    log.info("DomainCrawler - await queue.join")
    await self._q.join()
    log.info("DomainCrawler - join complete")

    for w in workers:
        w.cancel()
    log.debug("DomainCrawler - workers canceled")
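
# Hedged sketch (not part of the original source): the queue/worker pattern crawl()
# relies on. Each worker loops pulling ids from the queue, processes them, and calls
# task_done() so that queue.join() returns once all queued work has been handled.
# The fetch() call is a hypothetical placeholder for the real per-object handler.
async def _worker_sketch(self):
    while True:
        obj_id = await self._q.get()
        try:
            await self.fetch(obj_id)  # hypothetical per-object processing
        finally:
            self._q.task_done()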
async def DELETE_Attribute(request):
    """HTTP method to delete an attribute resource"""
    log.request(request)
    app = request.app
    collection = getRequestCollectionName(request)  # returns datasets|groups|datatypes

    obj_id = request.match_info.get('id')
    if not obj_id:
        msg = "Missing object id"
        log.warn(msg)
        raise HTTPBadRequest(reason=msg)
    if not isValidUuid(obj_id, obj_class=collection):
        msg = f"Invalid object id: {obj_id}"
        log.warn(msg)
        raise HTTPBadRequest(reason=msg)

    attr_name = request.match_info.get('name')
    log.debug(f"Attribute name: [{attr_name}]")
    validateAttributeName(attr_name)

    username, pswd = getUserPasswordFromRequest(request)
    await validateUserPassword(app, username, pswd)

    domain = getDomainFromRequest(request)
    if not isValidDomain(domain):
        msg = f"Invalid domain: {domain}"
        log.warn(msg)
        raise HTTPBadRequest(reason=msg)
    bucket = getBucketForDomain(domain)

    # get domain JSON
    domain_json = await getDomainJson(app, domain)
    if "root" not in domain_json:
        log.error(f"Expected root key for domain: {domain}")
        raise HTTPBadRequest(reason="Unexpected Error")

    # TBD - verify that the obj_id belongs to the given domain
    await validateAction(app, domain, obj_id, username, "delete")

    req = getDataNodeUrl(app, obj_id)
    req += '/' + collection + '/' + obj_id + "/attributes/" + attr_name
    log.info("DELETE Attribute: " + req)
    params = {}
    if bucket:
        params["bucket"] = bucket
    rsp_json = await http_delete(app, req, params=params)
    log.info(f"DELETE Attribute resp: {rsp_json}")

    hrefs = []  # TBD
    req_rsp = {"hrefs": hrefs}
    resp = await jsonResponse(request, req_rsp)
    log.response(request, resp=resp)
    return resp
async def PUT_ACL(request):
    """HTTP method to add a new ACL for a domain"""
    log.request(request)
    app = request.app

    acl_username = request.match_info.get('username')
    if not acl_username:
        msg = "Missing username for ACL"
        log.warn(msg)
        raise HTTPBadRequest(reason=msg)

    (username, pswd) = getUserPasswordFromRequest(request)
    await validateUserPassword(app, username, pswd)

    if not request.has_body:
        msg = "PUT ACL with no body"
        log.warn(msg)
        raise HTTPBadRequest(reason=msg)

    body = await request.json()
    acl_keys = getAclKeys()

    for k in body.keys():
        if k not in acl_keys:
            msg = "Unexpected key in request body: {}".format(k)
            log.warn(msg)
            raise HTTPBadRequest(reason=msg)
        if body[k] not in (True, False):
            msg = "Unexpected value for key in request body: {}".format(k)
            log.warn(msg)
            raise HTTPBadRequest(reason=msg)

    try:
        domain = getDomainFromRequest(request)
    except ValueError:
        msg = "Invalid domain"
        log.warn(msg)
        raise HTTPBadRequest(reason=msg)

    # don't use app["domain_cache"] if a direct domain request is made
    # as opposed to an implicit request as with other operations, query
    # the domain from the authoritative source (the dn node)
    req = getDataNodeUrl(app, domain)
    req += "/acls/" + acl_username
    log.info("sending dn req: {}".format(req))
    body["domain"] = domain

    put_rsp = await http_put(app, req, data=body)
    log.info("PUT ACL resp: " + str(put_rsp))

    # ACL update successful
    resp = await jsonResponse(request, put_rsp, status=201)
    log.response(request, resp=resp)
    return resp
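
# Hedged illustration (not in the original source): the request body accepted by this
# handler is a flat dict of permission flags, each strictly True or False (anything
# else is rejected above). The allowed flag names come from getAclKeys(); the names
# shown here are typical examples and should be treated as assumptions.
_example_acl_body = {
    "read": True,
    "update": True,
    "delete": False,
}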
async def getDomainObjects(app, root_id, include_attrs=False):
    """ Iterate through all objects in hierarchy and add to obj_dict keyed by obj id
    """
    log.info(f"getDomainObjects for root: {root_id}")
    crawler = DomainCrawler(app, root_id, include_attrs=include_attrs)
    await crawler.crawl()
    log.info(f"getDomainObjects returning: {len(crawler._obj_dict)} objects")
    return crawler._obj_dict
def validatePasswordSHA512(app, username, password):
    user_db = app["user_db"]
    if username not in user_db:
        log.info("SHA512 check for username: {}".format(username))
        salt = config.get("PASSWORD_SALT")
        hex_hash = hashlib.sha512(username.encode('utf-8') + salt.encode('utf-8')).hexdigest()
        if hex_hash[:32] != password:
            log.warn("user password is not valid (didn't equal sha512 hash) for user: {}".format(username))
            raise HTTPUnauthorized()  # 401
        log.info("Saving user/password to user_db for: {}".format(username))
        user_data = {"pwd": password}
        user_db[username] = user_data
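
# Hedged illustration (not in the original source): with the scheme above, the password
# a client presents must equal the first 32 hex characters of
# sha512(username + PASSWORD_SALT). The default salt below is made up for the example.
def _sha512_password_example(username, salt="example-salt"):
    token = hashlib.sha512(username.encode('utf-8') + salt.encode('utf-8')).hexdigest()
    return token[:32]  # what validatePasswordSHA512 compares the password against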
async def bucketCheck(app):
    """ Verify that contents of bucket are self-consistent
    """
    now = int(time.time())
    log.info("bucket check {}".format(unixTimeToUTC(now)))

    # do initial listKeys
    await listKeys(app)

    # clear used flags
    clearUsedFlags(app)

    # mark objs
    await markObjs(app)

    unlinked_count = 0
    s3objs = app["s3objs"]
    for objid in s3objs:
        if isValidUuid(objid) and not isValidChunkId(objid):
            try:
                s3obj = await getS3Obj(app, objid)
                if s3obj.used is False:
                    unlinked_count += 1
            except HTTPInternalServerError as hpe:
                log.warn("got error retrieving {}: {}".format(objid, hpe.code))

    domains = app["domains"]
    for domain in domains:
        print("domain:", domain)
    roots = app["roots"]
    for root in roots:
        print("root:", root)

    top_level_domains = []
    for domain in domains:
        if domain[0] != '/':
            log.error("unexpected domain: {}".format(domain))
            continue
        if domain[1:].find('/') == -1:
            top_level_domains.append(domain)

    print("top-level-domains:")
    for domain in top_level_domains:
        print(domain)
    print("=" * 80)

    print("total storage: {}".format(app["bytes_in_bucket"]))
    print("Num objects: {}".format(len(app["s3objs"])))
    print("Num domains: {}".format(len(app["domains"])))
    print("Num root groups: {}".format(len(app["roots"])))
    print("Unlinked objects: {}".format(unlinked_count))
async def PUT_ACL(request):
    """ Handler for creating/updating an ACL"""
    log.request(request)
    app = request.app
    acl_username = request.match_info.get('username')

    if not request.has_body:
        msg = "Expected body in PUT ACL"
        log.error(msg)
        raise HTTPInternalServerError()
    body_json = await request.json()

    domain = get_domain(request, body=body_json)
    log.info(f"put_acl - domain: {domain}, username: {acl_username}")

    # raises exception if domain not found
    domain_json = await get_metadata_obj(app, domain)

    if "acls" not in domain_json:
        log.error(f"unexpected domain data for domain: {domain}")
        raise HTTPInternalServerError()  # 500

    acl_keys = getAclKeys()
    acls = domain_json["acls"]
    acl = {}
    if acl_username in acls:
        acl = acls[acl_username]
    else:
        # initialize acl with no perms
        for k in acl_keys:
            acl[k] = False

    # replace any permissions given in the body
    for k in body_json.keys():
        acl[k] = body_json[k]

    # replace/insert the updated/new acl
    acls[acl_username] = acl

    # update the timestamp
    now = time.time()
    domain_json["lastModified"] = now

    # write back to S3
    await save_metadata_obj(app, domain, domain_json, flush=True)

    resp_json = {}
    resp = json_response(resp_json, status=201)
    log.response(request, resp=resp)
    return resp
async def scanRootKeys(app, update=False):
    # iterate through all s3 root keys in the bucket.
    #
    # Note: not re-entrant!  Only one scanRoot can be run at a time per app.
    log.info("scanRootKeys")
    app["scanRootKeys_update"] = update

    await getS3Keys(app, prefix="db/", deliminator='/', include_stats=False,
                    callback=getS3RootKeysCallback)
async def getStorKeys(app, prefix='', deliminator='', suffix='', include_stats=False,
                      callback=None, bucket=None, limit=None):
    # return keys matching the arguments
    client = _getStorageClient(app)
    if not bucket:
        bucket = app['bucket_name']
    log.info(f"getStorKeys('{prefix}','{deliminator}','{suffix}', include_stats={include_stats})")

    key_names = await client.list_keys(prefix=prefix, deliminator=deliminator, suffix=suffix,
                                       include_stats=include_stats, callback=callback,
                                       bucket=bucket, limit=limit)

    log.info(f"getStorKeys done, got {len(key_names)} keys")
    return key_names
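
# Hedged usage sketch (not part of the original source): list keys under the "db/"
# prefix with the helper above, collapsing at '/' so each root gets a single entry
# (the same prefix/deliminator combination scanRootKeys uses). "app" is assumed to
# already hold an initialized storage client and 'bucket_name'.
async def _list_roots_example(app):
    keys = await getStorKeys(app, prefix="db/", deliminator='/')
    log.info(f"got {len(keys)} root keys")
    return keys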
async def notify_root(app, root_id, bucket=None):
    # flag to write to S3
    log.info(f"notify_root: {root_id}")
    if not isValidUuid(root_id) or not isSchema2Id(root_id):
        log.error(f"unexpected call to notify with invalid id: {root_id}")
        return
    notify_req = getDataNodeUrl(app, root_id) + "/roots/" + root_id
    log.info(f"Notify: {notify_req} [{bucket}]")
    params = {}
    if bucket:
        params["bucket"] = bucket
    await http_post(app, notify_req, data={}, params=params)
def initUserDB(app):
    """
    Called at startup to initialize user/passwd dictionary from a password text file
    """
    log.info("initUserDB")
    if "user_db" in app:
        msg = "user_db already initialized"
        log.warn(msg)
        return

    if config.get("AWS_DYNAMODB_GATEWAY") and config.get("AWS_DYNAMODB_USERS_TABLE"):
        # user entries will be obtained dynamically
        log.info("Getting DynamoDB client")
        getDynamoDBClient(app)  # get client here so any errors will be seen right away
        user_db = {}
    elif config.get("PASSWORD_SALT"):
        # use salt key to verify passwords
        user_db = {}
    else:
        password_file = config.get("password_file")
        log.info("Loading password file: {}".format(password_file))
        user_db = loadPasswordFile(password_file)

    app["user_db"] = user_db

    log.info("user_db initialized: {} users".format(len(user_db)))
async def deleteStorObj(app, key, bucket=None):
    """ Delete storage object identified by given key
    """
    client = _getStorageClient(app)
    if not bucket:
        bucket = app['bucket_name']
    if key[0] == '/':
        key = key[1:]  # no leading slash
    log.info(f"deleteStorObj({key})")

    await client.delete_object(key, bucket=bucket)

    log.debug("deleteStorObj complete")
async def http_post(app, url, data=None, params=None):
    log.info("http_post('{}', data: {})".format(url, data))
    client = get_http_client(app)
    rsp_json = None
    timeout = config.get("timeout")

    try:
        async with client.post(url, json=data, params=params, timeout=timeout) as rsp:
            log.info("http_post status: {}".format(rsp.status))
            if rsp.status == 200:
                pass  # ok
            elif rsp.status == 201:
                pass  # also ok
            elif rsp.status == 204:  # no data
                return None
            elif rsp.status == 404:
                log.info(f"POST request HTTPNotFound error for url: {url}")
            elif rsp.status == 410:
                log.info(f"POST request HTTPGone error for url: {url}")
            else:
                log.warn(f"POST request error for url: {url} - status: {rsp.status}")
                raise HTTPInternalServerError()
            rsp_json = await rsp.json()
            log.debug("http_post({}) response: {}".format(url, rsp_json))
    except ClientError as ce:
        log.error("Error for http_post({}): {} ".format(url, str(ce)))
        raise HTTPInternalServerError()
    except CancelledError as cle:
        log.error(f"CancelledError for http_post({url}): {cle}")
        raise HTTPInternalServerError()
    return rsp_json
async def http_put(app, url, data=None, params=None):
    log.info("http_put('{}', data: {})".format(url, data))
    rsp = None
    client = get_http_client(app)
    timeout = config.get("timeout")

    try:
        async with client.put(url, json=data, params=params, timeout=timeout) as rsp:
            log.info("http_put status: {}".format(rsp.status))
            if rsp.status == 201:
                pass  # expected
            elif rsp.status == 404:
                # can come up for replace ops
                log.info(f"HTTPNotFound for: {url}")
            elif rsp.status == 409:
                log.info(f"HTTPConflict for: {url}")
                raise HTTPConflict()
            elif rsp.status == 503:
                log.warn(f"503 error for http_put url: {url}")
                raise HTTPServiceUnavailable()
            else:
                log.error(f"PUT request error for url: {url} - status: {rsp.status}")
                raise HTTPInternalServerError()
            rsp_json = await rsp.json()
            log.debug("http_put({}) response: {}".format(url, rsp_json))
    except ClientError as ce:
        log.error(f"ClientError for http_put({url}): {ce} ")
        raise HTTPInternalServerError()
    except CancelledError as cle:
        log.error(f"CancelledError for http_put({url}): {cle}")
        raise HTTPInternalServerError()
    return rsp_json
async def putStorBytes(app, key, data, shuffle=0, deflate_level=None, bucket=None):
    """ Store byte string as S3 object with given key
    """
    client = _getStorageClient(app)
    if not bucket:
        bucket = app['bucket_name']
    if key[0] == '/':
        key = key[1:]  # no leading slash
    log.info(f"putStorBytes({bucket}/{key}), {len(data)} bytes shuffle: {shuffle} deflate: {deflate_level}")

    if shuffle > 0:
        shuffled_data = _shuffle(shuffle, data)
        log.info(f"shuffled data to {len(shuffled_data)}")
        data = shuffled_data

    if deflate_level is not None:
        try:
            # the keyword parameter is enabled with py3.6
            # zip_data = zlib.compress(data, level=deflate_level)
            zip_data = zlib.compress(data, deflate_level)
            log.info(f"compressed from {len(data)} bytes to {len(zip_data)} bytes with level: {deflate_level}")
            data = zip_data
        except zlib.error as zlib_error:
            log.info(f"zlib_err: {zlib_error}")
            log.warn(f"unable to compress obj: {key}, using raw bytes")

    rsp = await client.put_object(key, data, bucket=bucket)
    return rsp
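
# Hedged usage sketch (not part of the original source): a put/get round trip through
# putStorBytes and getStorBytes above. It assumes "app" has already been set up with a
# storage client and a 'bucket_name' entry; the key below is made up for illustration.
async def _roundtrip_example(app):
    payload = b"\x00\x01\x02\x03" * 256
    key = "db/example-root/example-obj"  # hypothetical key
    # compress with deflate level 5 before writing to the bucket
    await putStorBytes(app, key, payload, deflate_level=5)
    # read it back; pass the same deflate_level so the bytes get decompressed
    data = await getStorBytes(app, key, deflate_level=5)
    assert data == payload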
async def DELETE_Datatype(request):
    """HTTP DELETE method for datatype
    """
    log.request(request)
    app = request.app
    params = request.rel_url.query
    ctype_id = get_obj_id(request)
    log.info(f"DELETE ctype: {ctype_id}")

    if "bucket" in params:
        bucket = params["bucket"]
    else:
        bucket = None

    # verify the id exists
    obj_found = await check_metadata_obj(app, ctype_id)
    if not obj_found:
        log.warn(f"Delete on non-existent obj: {ctype_id}")
        raise HTTPNotFound()

    log.info("deleting ctype: {}".format(ctype_id))

    notify = True
    if "Notify" in params and not params["Notify"]:
        log.info("notify value: {}".format(params["Notify"]))
        notify = False
    log.info("notify: {}".format(notify))

    await delete_metadata_obj(app, ctype_id, bucket=bucket, notify=notify)

    resp_json = {}

    resp = json_response(resp_json)
    log.response(request, resp=resp)
    return resp
async def DELETE_Link(request):
    """HTTP DELETE method for group links
    """
    log.request(request)
    app = request.app
    params = request.rel_url.query
    group_id = get_obj_id(request)
    log.info(f"DELETE link: {group_id}")

    if not isValidUuid(group_id, obj_class="group"):
        msg = f"Unexpected group_id: {group_id}"
        log.warn(msg)
        raise HTTPBadRequest(reason=msg)

    link_title = request.match_info.get('title')
    validateLinkName(link_title)

    if "bucket" in params:
        bucket = params["bucket"]
    else:
        bucket = None

    group_json = await get_metadata_obj(app, group_id, bucket=bucket)

    # TBD: Possible race condition
    if "links" not in group_json:
        log.error(f"unexpected group data for id: {group_id}")
        raise HTTPInternalServerError()

    links = group_json["links"]
    if link_title not in links:
        msg = f"Link name {link_title} not found in group: {group_id}"
        log.warn(msg)
        raise HTTPNotFound()

    del links[link_title]  # remove the link from dictionary

    # update the group lastModified
    now = time.time()
    group_json["lastModified"] = now

    # write back to S3
    await save_metadata_obj(app, group_id, group_json, bucket=bucket)

    hrefs = []  # TBD
    resp_json = {"href": hrefs}

    resp = json_response(resp_json)
    log.response(request, resp=resp)
    return resp
def getDataNodeUrl(app, obj_id):
    """ Return host/port for datanode for given obj_id.
    Throw exception if service is not ready"""
    dn_urls = app["dn_urls"]
    node_number = app["node_number"]
    if app["node_state"] != "READY" or node_number not in dn_urls:
        log.info("Node_state: {}".format(app["node_state"]))
        log.info("node_number: {}".format(node_number))
        msg = "Service not ready"
        log.warn(msg)
        raise HTTPServiceUnavailable()
    dn_number = getObjPartition(obj_id, app['node_count'])
    url = dn_urls[dn_number]
    return url