async def DELETE_Object(request):
    log.request(request)
    app = request.app
    delete_set = app["delete_set"]

    objid = request.match_info.get('id')
    if not isValidUuid(objid):
        log.warn(f"Invalid id: {objid}")
        raise HTTPBadRequest()

    if isSchema2Id(objid):
        # determine the collection type for this id
        collection = getCollectionForId(objid)
        if collection == "datasets":
            delete_set.add(objid)
        elif collection == "groups":
            # only need to do anything if this is the root group
            if isRootObjId(objid):
                log.info(f"adding root group: {objid} to delete_set")
                delete_set.add(objid)
            else:
                log.info(f"ignoring delete non-root group: {objid}")
        elif collection == "datatypes":
            log.info(f"ignoring delete for datatype object: {objid}")
        else:
            log.error(f"Unexpected collection type: {collection}")

    resp_json = {}
    resp = json_response(resp_json)
    log.response(request, resp=resp)
    return resp
async def fetch(self, obj_id):
    log.debug(f"DomainCrawler - fetch for obj_id: {obj_id}")
    obj_json = await getObjectJson(self._app, obj_id, include_links=True,
                                   include_attrs=self._include_attrs)
    log.debug(f"DomainCrawler - for {obj_id} got json: {obj_json}")
    # including links, so don't need link count
    if "link_count" in obj_json:
        del obj_json["link_count"]
    self._obj_dict[obj_id] = obj_json
    if self._include_attrs:
        del obj_json["attributeCount"]

    # if this is a group, iterate through all the hard links and
    # add to the lookup ids set
    if getCollectionForId(obj_id) == "groups":
        links = obj_json["links"]
        log.debug(f"DomainCrawler links: {links}")
        for title in links:
            log.debug(f"DomainCrawler - got link: {title}")
            link_obj = links[title]
            if link_obj["class"] != 'H5L_TYPE_HARD':
                continue
            link_id = link_obj["id"]
            if link_id not in self._obj_dict:
                # haven't seen this object yet, get obj json
                log.debug(f"DomainCrawler - adding link_id: {link_id}")
                self._obj_dict[link_id] = {}  # placeholder for obj id
                self._q.put_nowait(link_id)
    log.debug(f"DomainCrawler - fetch complete obj_id: {obj_id}")
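# A minimal, self-contained sketch of the crawl pattern fetch() participates
# in: a dict of visited ids doubling as the result map, plus an asyncio.Queue
# of ids still to process. The in-memory `graph` and its ids are made-up
# stand-ins for the hard-link scan, not part of the DomainCrawler API.
import asyncio

graph = {"g-root": ["g-a", "d-1"], "g-a": ["d-2"], "d-1": [], "d-2": []}

async def crawl(root_id):
    obj_dict = {root_id: {}}  # mirrors self._obj_dict; placeholder marks "queued"
    q = asyncio.Queue()
    q.put_nowait(root_id)
    while not q.empty():
        obj_id = q.get_nowait()
        for link_id in graph[obj_id]:  # stands in for iterating hard links
            if link_id not in obj_dict:
                obj_dict[link_id] = {}  # placeholder so the id is queued only once
                q.put_nowait(link_id)
    return obj_dict

print(asyncio.run(crawl("g-root")))  # each object visited exactly once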
async def getObjectIdByPath(app, obj_id, h5path, refresh=False):
    """ Find the object at the provided h5path location.
    If not found raise 404 error.
    """
    log.info("getObjectIdByPath obj_id: {} h5path: {} refresh: {}".format(
        obj_id, h5path, refresh))
    if h5path.startswith("./"):
        h5path = h5path[2:]  # treat as relative path
    links = h5path.split('/')
    for link in links:
        if not link:
            continue  # skip empty link
        log.debug("getObjectIdByPath for objid: {} got link: {}".format(
            obj_id, link))
        if getCollectionForId(obj_id) != "groups":
            # not a group, so won't have links
            msg = "h5path: {} not found".format(h5path)
            log.warn(msg)
            raise HTTPNotFound()
        req = getDataNodeUrl(app, obj_id)
        req += "/groups/" + obj_id + "/links/" + link
        log.debug("get LINK: " + req)
        link_json = await http_get(app, req)
        log.debug("got link_json: " + str(link_json))
        if link_json["class"] != 'H5L_TYPE_HARD':
            # don't follow soft/external links, so the path can't be resolved
            msg = "h5path: {} not found".format(h5path)
            log.warn(msg)
            raise HTTPNotFound()
        obj_id = link_json["id"]
    # if we get here, we've traversed the entire path and found the object
    return obj_id
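# The same traversal against an in-memory link table instead of DN requests;
# `tree` and its ids are hypothetical. Each path component is resolved one
# hard link at a time, exactly like the loop above.
tree = {
    "g-root": {"data": "g-data"},
    "g-data": {"temperature": "d-temp"},
}

def object_id_by_path(obj_id, h5path):
    if h5path.startswith("./"):
        h5path = h5path[2:]  # treat as relative path
    for link in h5path.split('/'):
        if not link:
            continue  # skip empty component from a leading or doubled '/'
        if obj_id not in tree or link not in tree[obj_id]:
            raise KeyError(f"h5path component not found: {link}")
        obj_id = tree[obj_id][link]  # follow the link to the next object
    return obj_id

assert object_id_by_path("g-root", "./data/temperature") == "d-temp"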
async def validateAction(app, domain, obj_id, username, action):
    """ Check that the given object belongs in the domain and that the
    requested action (create, read, update, delete, readACL, updateACL)
    is permitted for the requesting user.
    """
    meta_cache = app['meta_cache']
    log.info(f"validateAction(domain={domain}, obj_id={obj_id}, "
             f"username={username}, action={action})")
    # get domain JSON
    domain_json = await getDomainJson(app, domain)
    if "root" not in domain_json:
        msg = f"Expected root key for domain: {domain}"
        log.warn(msg)
        raise HTTPBadRequest(reason=msg)
    obj_json = None
    if obj_id in meta_cache:
        obj_json = meta_cache[obj_id]
    else:
        # fetch from DN
        collection = getCollectionForId(obj_id)
        req = getDataNodeUrl(app, obj_id)
        req += '/' + collection + '/' + obj_id
        bucket = getBucketForDomain(domain)
        params = {}
        if bucket:
            params["bucket"] = bucket
        obj_json = await http_get(app, req, params=params)
        meta_cache[obj_id] = obj_json
    log.debug("obj_json[root]: {} domain_json[root]: {}".format(
        obj_json["root"], domain_json["root"]))
    if obj_json["root"] != domain_json["root"]:
        log.info("unexpected root, reloading domain")
        domain_json = await getDomainJson(app, domain, reload=True)
        if "root" not in domain_json or obj_json["root"] != domain_json["root"]:
            msg = "Object id is not a member of the given domain"
            log.warn(msg)
            raise HTTPBadRequest(reason=msg)
    if action not in ("create", "read", "update", "delete",
                      "readACL", "updateACL"):
        log.error(f"unexpected action: {action}")
        raise HTTPInternalServerError()
    reload = False
    try:
        aclCheck(domain_json, action, username)  # throws exception if not allowed
    except HTTPForbidden:
        log.info(f"got HTTPForbidden on validate action for domain: {domain},"
                 " reloading...")
        # just in case the ACL was recently updated, refetch the domain
        reload = True
    if reload:
        domain_json = await getDomainJson(app, domain, reload=True)
        aclCheck(domain_json, action, username)
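# The try/except plus reload in validateAction is a stale-cache retry: check
# against the cached domain, and on a permission failure refetch once and
# check again. A sketch with hypothetical acl_check/load_domain stand-ins:
class Forbidden(Exception):
    pass

def acl_check(domain_json, action, username):
    if action not in domain_json["acls"].get(username, ()):
        raise Forbidden(f"{username} may not {action}")

def validate(load_domain, action, username):
    domain_json = load_domain(reload=False)
    try:
        acl_check(domain_json, action, username)
    except Forbidden:
        # the ACL may have been updated since it was cached; refetch and retry
        domain_json = load_domain(reload=True)
        acl_check(domain_json, action, username)

stale = {"acls": {"joe": ()}}
fresh = {"acls": {"joe": ("read",)}}
validate(lambda reload: fresh if reload else stale, "read", "joe")  # passes on retry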
async def bucketGC(app):
    """ remove objects from db for any deleted root groups or datasets """
    log.info("bucketGC start")
    async_sleep_time = int(config.get("async_sleep_time"))
    log.info("async_sleep_time: {}".format(async_sleep_time))

    # update/initialize root object before starting GC
    while True:
        if app["node_state"] != "READY":
            log.info("bucketGC - waiting for Node state to be READY")
            await asyncio.sleep(async_sleep_time)
            continue  # wait for READY state

        gc_ids = app["gc_ids"]
        while len(gc_ids) > 0:
            obj_id = gc_ids.pop()
            log.info(f"got gc id: {obj_id}")
            if not isValidUuid(obj_id):
                log.error(f"bucketGC - got unexpected gc id: {obj_id}")
                continue
            if not isSchema2Id(obj_id):
                log.warn(f"bucketGC - ignoring v1 id: {obj_id}")
                continue
            if getCollectionForId(obj_id) == "groups":
                if not isRootObjId(obj_id):
                    log.error(f"bucketGC - unexpected non-root id: {obj_id}")
                    continue
                log.info(f"bucketGC - delete root objs: {obj_id}")
                await removeKeys(app, obj_id)
            elif getCollectionForId(obj_id) == "datasets":
                log.info(f"bucketGC - delete dataset: {obj_id}")
                await removeKeys(app, obj_id)
            else:
                log.error(f"bucketGC - unexpected obj_id class: {obj_id}")

        log.info(f"bucketGC - sleep: {async_sleep_time}")
        await asyncio.sleep(async_sleep_time)

    # shouldn't ever get here
    log.error("bucketGC terminating unexpectedly")
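# bucketGC follows a standard asyncio background-worker shape: drain a shared
# work set, sleep, repeat. A stripped-down sketch (the `state` dict and the
# bounded cycle count are assumptions for demonstration; the real loop never
# exits):
import asyncio

async def gc_loop(state, sleep_time=0.01, max_cycles=3):
    for _ in range(max_cycles):
        while state["gc_ids"]:
            obj_id = state["gc_ids"].pop()
            state["removed"].append(obj_id)  # stands in for removeKeys()
        await asyncio.sleep(sleep_time)

state = {"gc_ids": {"g-1", "d-2"}, "removed": []}
asyncio.run(gc_loop(state))
print(sorted(state["removed"]))  # ['d-2', 'g-1']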
async def get_collections(app, root_id):
    """ Return the object ids for the given root. """
    groups = {}
    datasets = {}
    datatypes = {}
    lookup_ids = set()
    lookup_ids.add(root_id)

    while lookup_ids:
        grp_id = lookup_ids.pop()
        req = getDataNodeUrl(app, grp_id)
        req += '/groups/' + grp_id + "/links"
        log.debug("collection get LINKS: " + req)
        try:
            links_json = await http_get(app, req)  # throws 404 if doesn't exist
        except HTTPNotFound:
            log.warn(f"get_collection, group {grp_id} not found")
            continue
        log.debug(f"got links json from dn for group_id: {grp_id}")
        links = links_json["links"]
        log.debug(f"get_collection: got links: {links}")
        for link in links:
            if link["class"] != 'H5L_TYPE_HARD':
                continue
            link_id = link["id"]
            obj_type = getCollectionForId(link_id)
            if obj_type == "groups":
                if link_id in groups:
                    continue  # been here before
                groups[link_id] = {}
                lookup_ids.add(link_id)
            elif obj_type == "datasets":
                if link_id in datasets:
                    continue
                datasets[link_id] = {}
            elif obj_type == "datatypes":
                if link_id in datatypes:
                    continue
                datatypes[link_id] = {}
            else:
                log.error(
                    f"get_collection: unexpected link object type: {obj_type}")
                raise HTTPInternalServerError()

    result = {}
    result["groups"] = groups
    result["datasets"] = datasets
    result["datatypes"] = datatypes
    return result
async def getPathForObjectId(app, parent_id, idpath_map, tgt_id=None):
    """ Search the objects reachable from the given parent_id.
    idpath_map should be a dict with at minimum the key:
    parent_id: <parent_path>.
    If tgt_id is not None, returns the first path that matches tgt_id, or
    None if not found. If tgt_id is None, every reachable object is visited
    and idpath_map is populated with id: path entries (the return value will
    be None).
    """
    if not parent_id:
        log.error("No parent_id passed to getPathForObjectId")
        raise HTTPInternalServerError()
    if parent_id not in idpath_map:
        msg = "Obj {} expected to be found in idpath_map".format(parent_id)
        log.error(msg)
        raise HTTPInternalServerError()
    parent_path = idpath_map[parent_id]
    if parent_id == tgt_id:
        return parent_path

    req = getDataNodeUrl(app, parent_id)
    req += "/groups/" + parent_id + "/links"
    log.debug("getPathForObjectId LINKS: " + req)
    links_json = await http_get(app, req)
    log.debug(
        "getPathForObjectId got links json from dn for parent_id: {}".format(
            parent_id))
    links = links_json["links"]

    h5path = None
    for link in links:
        if link["class"] != "H5L_TYPE_HARD":
            continue  # ignore everything except hard links
        link_id = link["id"]
        if link_id in idpath_map:
            continue  # this node has already been visited
        title = link["title"]
        if tgt_id is not None and link_id == tgt_id:
            # found it!
            h5path = op.join(parent_path, title)
            break
        idpath_map[link_id] = op.join(parent_path, title)
        if getCollectionForId(link_id) != "groups":
            continue
        h5path = await getPathForObjectId(
            app, link_id, idpath_map, tgt_id)  # recursive call
        if tgt_id is not None and h5path:
            break

    return h5path
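# Self-contained sketch of the recursive search above: idpath_map doubles as
# the visited set and the id -> h5path accumulator. The `links` table and ids
# are hypothetical; posixpath gives the same '/'-joining as op.join here.
import posixpath as op

links = {"g-root": {"a": "g-a"}, "g-a": {"x": "d-x"}, "d-x": {}}

def path_for_object_id(parent_id, idpath_map, tgt_id=None):
    parent_path = idpath_map[parent_id]
    if parent_id == tgt_id:
        return parent_path
    h5path = None
    for title, link_id in links[parent_id].items():
        if link_id in idpath_map:
            continue  # already visited
        if tgt_id is not None and link_id == tgt_id:
            return op.join(parent_path, title)  # found it
        idpath_map[link_id] = op.join(parent_path, title)
        h5path = path_for_object_id(link_id, idpath_map, tgt_id)  # recurse
        if tgt_id is not None and h5path:
            break
    return h5path

assert path_for_object_id("g-root", {"g-root": "/"}, tgt_id="d-x") == "/a/x"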
async def getObjectJson(app, obj_id, bucket=None, refresh=False,
                        include_links=False, include_attrs=False):
    """ Return top-level json (i.e. excluding attributes or links by default)
    for a given obj_id.
    If refresh is False, any data present in the meta_cache will be
    returned. Otherwise the DN will be queried, and any resultant data
    added to the meta_cache.
    Note: meta_cache values may be stale, but use of immutable data
    (e.g. type of a dataset) is always valid.
    """
    meta_cache = app['meta_cache']
    obj_json = None
    if include_links or include_attrs:
        # links and attributes are subject to change, so always refresh
        refresh = True
    log.info(f"getObjectJson {obj_id}")
    if obj_id in meta_cache and not refresh:
        log.debug(f"found {obj_id} in meta_cache")
        obj_json = meta_cache[obj_id]
    else:
        req = getDataNodeUrl(app, obj_id)
        collection = getCollectionForId(obj_id)
        params = {}
        if include_links:
            params["include_links"] = 1
        if include_attrs:
            params["include_attrs"] = 1
        if bucket:
            params["bucket"] = bucket
        req += '/' + collection + '/' + obj_id
        # throws 404 if doesn't exist
        obj_json = await http_get(app, req, params=params)
        meta_cache[obj_id] = obj_json
    if obj_json is None:
        msg = f"Object: {obj_id} not found"
        log.warn(msg)
        raise HTTPNotFound()
    return obj_json
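# getObjectJson's caching rule in isolation: serve from meta_cache unless a
# refresh is forced, and force one whenever mutable fields (links/attributes)
# are requested. fetch() below is an assumed stand-in for the DN http_get call.
import asyncio

meta_cache = {}
calls = []

async def fetch(obj_id):
    calls.append(obj_id)  # record DN round trips
    return {"id": obj_id}

async def get_json(obj_id, refresh=False, include_links=False):
    if include_links:
        refresh = True  # links can change; never trust the cache for them
    if obj_id in meta_cache and not refresh:
        return meta_cache[obj_id]
    obj_json = await fetch(obj_id)
    meta_cache[obj_id] = obj_json
    return obj_json

async def demo():
    await get_json("g-1")                      # miss -> DN fetch
    await get_json("g-1")                      # hit  -> no fetch
    await get_json("g-1", include_links=True)  # forced refresh -> DN fetch

asyncio.run(demo())
print(calls)  # ['g-1', 'g-1']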
async def getObjectJson(app, obj_id, refresh=False):
    """ Return top-level json (i.e. excluding attributes or links) for a
    given obj_id.
    If refresh is False, any data present in the meta_cache will be
    returned. Otherwise the DN will be queried, and any resultant data
    added to the meta_cache.
    Note: meta_cache values may be stale, but use of immutable data
    (e.g. type of a dataset) is always valid.
    """
    meta_cache = app['meta_cache']
    obj_json = None
    log.info("getObjectJson {}".format(obj_id))
    if obj_id in meta_cache and not refresh:
        log.debug("found {} in meta_cache".format(obj_id))
        obj_json = meta_cache[obj_id]
    else:
        req = getDataNodeUrl(app, obj_id)
        collection = getCollectionForId(obj_id)
        req += '/' + collection + '/' + obj_id
        obj_json = await http_get(app, req)  # throws 404 if doesn't exist
        meta_cache[obj_id] = obj_json
    if obj_json is None:
        msg = "Object: {} not found".format(obj_id)
        log.warn(msg)
        raise HTTPNotFound()
    return obj_json
async def GET_Links(request):
    """HTTP method to return JSON for link collection"""
    log.request(request)
    app = request.app
    params = request.rel_url.query

    group_id = request.match_info.get('id')
    if not group_id:
        msg = "Missing group id"
        log.warn(msg)
        raise HTTPBadRequest(reason=msg)
    if not isValidUuid(group_id, obj_class="Group"):
        msg = "Invalid group id: {}".format(group_id)
        log.warn(msg)
        raise HTTPBadRequest(reason=msg)
    limit = None
    if "Limit" in params:
        try:
            limit = int(params["Limit"])
        except ValueError:
            msg = "Bad Request: Expected int type for limit"
            log.warn(msg)
            raise HTTPBadRequest(reason=msg)
    marker = None
    if "Marker" in params:
        marker = params["Marker"]

    username, pswd = getUserPasswordFromRequest(request)
    if username is None and app['allow_noauth']:
        username = "******"
    else:
        await validateUserPassword(app, username, pswd)

    domain = getDomainFromRequest(request)
    if not isValidDomain(domain):
        msg = "Invalid host value: {}".format(domain)
        log.warn(msg)
        raise HTTPBadRequest(reason=msg)

    await validateAction(app, domain, group_id, username, "read")

    req = getDataNodeUrl(app, group_id)
    req += "/groups/" + group_id + "/links"
    query_sep = '?'
    if limit is not None:
        req += query_sep + "Limit=" + str(limit)
        query_sep = '&'
    if marker is not None:
        req += query_sep + "Marker=" + marker
    log.debug("get LINKS: " + req)
    links_json = await http_get(app, req)
    log.debug("got links json from dn for group_id: {}".format(group_id))
    links = links_json["links"]

    # mix in collection key, target and hrefs
    for link in links:
        if link["class"] == "H5L_TYPE_HARD":
            collection_name = getCollectionForId(link["id"])
            link["collection"] = collection_name
            target_uri = '/' + collection_name + '/' + link["id"]
            link["target"] = getHref(request, target_uri)
        link_uri = '/groups/' + group_id + '/links/' + link['title']
        link["href"] = getHref(request, link_uri)

    resp_json = {}
    resp_json["links"] = links
    hrefs = []
    group_uri = '/groups/' + group_id
    hrefs.append({'rel': 'self',
                  'href': getHref(request, group_uri + '/links')})
    hrefs.append({'rel': 'home', 'href': getHref(request, '/')})
    hrefs.append({'rel': 'owner', 'href': getHref(request, group_uri)})
    resp_json["hrefs"] = hrefs

    resp = await jsonResponse(request, resp_json)
    log.response(request, resp=resp)
    return resp
async def GET_Link(request):
    """HTTP method to return JSON for a group link"""
    log.request(request)
    app = request.app

    group_id = request.match_info.get('id')
    if not group_id:
        msg = "Missing group id"
        log.warn(msg)
        raise HTTPBadRequest(reason=msg)
    if not isValidUuid(group_id, obj_class="Group"):
        msg = "Invalid group id: {}".format(group_id)
        log.warn(msg)
        raise HTTPBadRequest(reason=msg)
    link_title = request.match_info.get('title')
    validateLinkName(link_title)

    username, pswd = getUserPasswordFromRequest(request)
    if username is None and app['allow_noauth']:
        username = "******"
    else:
        await validateUserPassword(app, username, pswd)

    domain = getDomainFromRequest(request)
    if not isValidDomain(domain):
        msg = "Invalid host value: {}".format(domain)
        log.warn(msg)
        raise HTTPBadRequest(reason=msg)

    await validateAction(app, domain, group_id, username, "read")

    req = getDataNodeUrl(app, group_id)
    req += "/groups/" + group_id + "/links/" + link_title
    log.debug("get LINK: " + req)
    link_json = await http_get(app, req)
    log.debug("got link_json: " + str(link_json))
    resp_link = {}
    resp_link["title"] = link_title
    resp_link["class"] = link_json["class"]
    if link_json["class"] == "H5L_TYPE_HARD":
        resp_link["id"] = link_json["id"]
        resp_link["collection"] = getCollectionForId(link_json["id"])
    elif link_json["class"] == "H5L_TYPE_SOFT":
        resp_link["h5path"] = link_json["h5path"]
    elif link_json["class"] == "H5L_TYPE_EXTERNAL":
        resp_link["h5path"] = link_json["h5path"]
        resp_link["h5domain"] = link_json["h5domain"]
    else:
        log.warn("Unexpected link class: {}".format(link_json["class"]))
    resp_json = {}
    resp_json["link"] = resp_link
    resp_json["created"] = link_json["created"]
    # links don't get modified, so use created timestamp as lastModified
    resp_json["lastModified"] = link_json["created"]

    hrefs = []
    group_uri = '/groups/' + group_id
    hrefs.append({'rel': 'self',
                  'href': getHref(request, group_uri + '/links/' + link_title)})
    hrefs.append({'rel': 'home', 'href': getHref(request, '/')})
    hrefs.append({'rel': 'owner', 'href': getHref(request, group_uri)})
    if link_json["class"] == "H5L_TYPE_HARD":
        target = '/' + resp_link["collection"] + '/' + resp_link["id"]
        hrefs.append({'rel': 'target', 'href': getHref(request, target)})
    resp_json["hrefs"] = hrefs

    resp = await jsonResponse(request, resp_json)
    log.response(request, resp=resp)
    return resp
def scanRootCallback(app, s3keys):
    log.debug(f"scanRootCallback, {len(s3keys)} items")
    if isinstance(s3keys, list):
        log.error("got list result for s3keys callback")
        raise ValueError("unexpected callback format")

    results = app["scanRoot_results"]
    if results:
        log.debug(f"previous scanRoot_results: {results}")

    for s3key in s3keys.keys():
        if not isS3ObjKey(s3key):
            log.info(f"not s3obj key, ignoring: {s3key}")
            continue
        objid = getObjId(s3key)
        etag = None
        obj_size = None
        lastModified = None
        item = s3keys[s3key]
        if "ETag" in item:
            etag = item["ETag"]
        if "Size" in item:
            obj_size = item["Size"]
        if "LastModified" in item:
            lastModified = item["LastModified"]
        log.debug(f"{objid}: {etag} {obj_size} {lastModified}")

        if lastModified > results["lastModified"]:
            log.debug(f"changing lastModified from: "
                      f"{results['lastModified']} to {lastModified}")
            results["lastModified"] = lastModified
        is_chunk = False
        if isValidChunkId(objid):
            is_chunk = True
            results["num_chunks"] += 1
            results["allocated_bytes"] += obj_size
        else:
            results["metadata_bytes"] += obj_size

        if is_chunk or getCollectionForId(objid) == "datasets":
            # chunk or dataset object
            if is_chunk:
                dsetid = getDatasetId(objid)
            else:
                dsetid = objid
            datasets = results["datasets"]
            if dsetid not in datasets:
                dataset_info = {}
                dataset_info["lastModified"] = 0
                dataset_info["num_chunks"] = 0
                dataset_info["allocated_bytes"] = 0
                datasets[dsetid] = dataset_info
            dataset_info = datasets[dsetid]
            if lastModified > dataset_info["lastModified"]:
                dataset_info["lastModified"] = lastModified
            if is_chunk:
                dataset_info["num_chunks"] += 1
                dataset_info["allocated_bytes"] += obj_size
        elif getCollectionForId(objid) == "groups":
            results["num_groups"] += 1
        elif getCollectionForId(objid) == "datatypes":
            results["num_datatypes"] += 1
        else:
            log.error(f"Unexpected collection type for id: {objid}")
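# The callback above is effectively a fold over listing batches: each batch
# updates running totals in app["scanRoot_results"]. A minimal standalone
# version of that accumulator (ids and sizes are made-up; the "c-" prefix
# test stands in for isValidChunkId):
results = {"lastModified": 0, "num_chunks": 0, "allocated_bytes": 0,
           "metadata_bytes": 0}

def on_batch(items):  # items: objid -> {"Size": ..., "LastModified": ...}
    for objid, item in items.items():
        results["lastModified"] = max(results["lastModified"],
                                      item["LastModified"])
        if objid.startswith("c-"):  # chunk object
            results["num_chunks"] += 1
            results["allocated_bytes"] += item["Size"]
        else:  # group/dataset/datatype metadata object
            results["metadata_bytes"] += item["Size"]

on_batch({"c-123": {"Size": 1024, "LastModified": 10},
          "g-root": {"Size": 512, "LastModified": 11}})
print(results)
# {'lastModified': 11, 'num_chunks': 1, 'allocated_bytes': 1024, 'metadata_bytes': 512}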