def ensure_folder(pathname):
    """Open (creating where needed) every folder level from ``tgt_folder`` to *pathname*.

    Relies on module-level settings: ``tgt_folder``, ``hsds_global``,
    ``username``, ``password`` and ``tgt_bucket``.

    Parameters
    ----------
    pathname : str
        Folder path that is expected to start with ``tgt_folder``.

    Returns
    -------
    bool
        False when *pathname* does not start with ``tgt_folder`` (nothing is
        opened in that case), True once every path component was processed.
    """
    tgt_path = tgt_folder
    if not pathname.startswith(tgt_path):
        return False

    endpoint = hsds_global if hsds_global else None
    conn_kwargs = dict(endpoint=endpoint, username=username,
                       password=password, bucket=tgt_bucket)

    folder = h5pyd.Folder(tgt_path, **conn_kwargs)
    try:
        for name in pathname[len(tgt_path):].split("/"):
            if not name:
                # empty components come from leading/trailing/double slashes
                continue
            # HSDS paths always use '/', so build them by hand instead of
            # relying on the platform-dependent os.path.join.
            if tgt_path and not tgt_path.endswith("/"):
                tgt_path += "/"
            tgt_path += name + "/"
            if name not in folder:
                print(f"creating folder: {tgt_path}")
            # NOTE(review): assumes mode="w" creates a missing folder and
            # simply opens an existing one - confirm against h5pyd.
            child = h5pyd.Folder(tgt_path, mode="w", **conn_kwargs)
            # descend one level; release the parent handle first
            folder.close()
            folder = child
    finally:
        folder.close()
    return True
def getHomeFolder(username):
    """Return the path of the folder under '/home/' that *username* owns.

    Only folders whose name is a prefix of *username* are inspected
    (e.g. folder "/home/bob/" for a username such as "bob@example.com").

    Parameters
    ----------
    username : str
        Account name to look up. A false value returns None immediately.

    Returns
    -------
    str or None
        '/home/<name>/' of the first matching folder whose ``owner`` equals
        *username*, or None when no such folder is found.
    """
    if not username:
        return None

    homefolder = None
    home_root = h5pyd.Folder('/home/')  # folder object for the root of all home folders
    try:
        for name in home_root:
            if not username.startswith(name):
                continue
            path = '/home/' + name + '/'
            try:
                candidate = h5pyd.Folder(path)
            except IOError as ioe:
                print("got ioe:", ioe)
                continue
            except Exception as e:
                # best effort: a folder we cannot open is just skipped
                print("got exception:", e)
                continue
            try:
                if candidate.owner == username:
                    homefolder = path
            finally:
                candidate.close()
            if homefolder:
                break
    finally:
        # close the root handle even if iteration or an owner lookup fails
        home_root.close()
    return homefolder
def check_res_file(res_file):
    """
    Classify a resource path: local file, multi-file pattern, or HSDS path.

    Parameters
    ----------
    res_file : str
        Filepath to single resource file, unix style multi-file path like
        /h5_dir/prefix*suffix.h5, or an hsds filepath (filename of hsds
        path can also contain wildcards *)

    Returns
    -------
    multi_h5_res : bool
        True when a MultiFileResource handler should be used
    hsds : bool
        True when the path was found through h5pyd (HSDS) rather than on
        local disk

    Raises
    ------
    FileInputError
        If res_file is a local directory without a wildcard.
    FileNotFoundError
        If res_file is neither a local file nor a path that HSDS can resolve.
    """
    # An existing local file needs neither handler.
    if os.path.isfile(res_file):
        return False, False

    # Any wildcard means a multi-file resource.
    if '*' in res_file:
        return True, False

    if os.path.isdir(res_file):
        msg = ('Cannot parse directory, need to add wildcard * suffix: {}'
               .format(res_file))
        raise FileInputError(msg)

    # Not on local disk: see whether HSDS knows about it.
    multi_h5_res = False
    hsds = False
    try:
        import h5pyd

        hsds_dir = os.path.dirname(res_file)
        with h5pyd.Folder(hsds_dir + '/') as f:
            hsds = True
            fps = []
            for fn in f:
                candidate = f'{hsds_dir}/{fn}'
                if fnmatch(candidate, res_file):
                    fps.append(candidate)

            if not any(fps):
                msg = ('{} is not a valid HSDS file path!'.format(res_file))
                raise FileNotFoundError(msg)
            if len(fps) > 1:
                multi_h5_res = True
    except Exception as ex:
        msg = ("{} is not a valid file path, and HSDS "
               "cannot be check for a file at this path:{}!".format(
                   res_file, ex))
        raise FileNotFoundError(msg) from ex

    return multi_h5_res, hsds
def getFolder(domain, mode='r'):
    """Open *domain* as a Folder using the connection settings stored in ``cfg``.

    Reads ``hs_username``, ``hs_password``, ``hs_endpoint`` and ``hs_bucket``
    from the module-level ``cfg`` and returns the opened folder object.
    """
    # NOTE(review): ``h5py`` must provide ``Folder`` here - presumably h5pyd
    # imported under that name; confirm against the file's imports.
    connection = {
        "username": cfg["hs_username"],
        "password": cfg["hs_password"],
        "endpoint": cfg["hs_endpoint"],
        "bucket": cfg["hs_bucket"],
    }
    return h5py.Folder(domain, mode=mode, **connection)
def getFolder(domain):
    """Open *domain* as a Folder with the credentials and endpoint from ``cfg``.

    Returns the folder object created by ``h5py.Folder``.
    """
    credentials = {
        "username": cfg["hs_username"],
        "password": cfg["hs_password"],
        "endpoint": cfg["hs_endpoint"],
    }
    return h5py.Folder(domain, **credentials)
def getDomainInfo(domain, cfg):
    """Print summary information about a domain or folder.

    Parameters
    ----------
    domain : str
        Path to inspect. A trailing '/' selects a folder (opened with
        ``h5pyd.Folder``); anything else is opened with ``h5pyd.File``.
    cfg : mapping
        Must provide ``hs_username``, ``hs_password`` and ``hs_endpoint``.

    Notes
    -----
    Calls ``sys.exit`` with a message when the open fails with an IOError
    (404/410 not found, 401 unauthorized, 403 forbidden, anything else
    reported as unexpected).
    """
    username = cfg["hs_username"]
    password = cfg["hs_password"]
    endpoint = cfg["hs_endpoint"]

    is_folder = domain.endswith('/')
    opener = h5pyd.Folder if is_folder else h5pyd.File

    try:
        f = opener(domain, mode='r', endpoint=endpoint, username=username,
                   password=password, use_cache=True)
    except IOError as oe:
        if oe.errno in (404, 410):  # Not Found
            sys.exit("domain: {} not found".format(domain))
        elif oe.errno == 401:  # Unauthorized
            sys.exit("Authorization failure")
        elif oe.errno == 403:  # Forbidden
            sys.exit("Not allowed")
        else:
            sys.exit("Unexpected error: {}".format(oe))

    # Close the handle even if reading an attribute or printing fails.
    try:
        timestamp = datetime.fromtimestamp(int(f.modified))

        if is_folder:
            print("folder: {}".format(domain))
            print(" owner: {}".format(f.owner))
            print(" last modified: {}".format(timestamp))
        else:
            # report HDF objects (groups, datasets, and named datatypes)
            # vs. allocated chunks
            num_objects = f.num_groups + f.num_datatypes + f.num_datasets
            num_chunks = f.num_objects - num_objects

            print("domain: {}".format(domain))
            print(" owner: {}".format(f.owner))
            print(" id: {}".format(f.id.id))
            print(" last modified: {}".format(timestamp))
            print(" total_size: {}".format(format_size(f.total_size)))
            print(" allocated_bytes: {}".format(format_size(f.allocated_bytes)))
            print(" num objects: {}".format(num_objects))
            print(" num chunks: {}".format(num_chunks))
    finally:
        f.close()
def createFolder(domain):
    """Open *domain* as a Folder in exclusive-create mode ('x').

    Connection settings come from the module-level ``cfg``; the optional
    ``hs_owner`` entry is passed as the folder owner (None when absent).
    """
    settings = {
        "username": cfg["hs_username"],
        "password": cfg["hs_password"],
        "endpoint": cfg["hs_endpoint"],
    }
    settings["owner"] = cfg["hs_owner"] if "hs_owner" in cfg else None
    return h5py.Folder(domain, mode='x', **settings)
def getFolder(domain, mode="r"):
    """Return an ``h5pyd.Folder`` for *domain*, opened with *mode*.

    Credentials and endpoint are read from the module-level ``cfg``.
    """
    login = {
        "username": cfg["hs_username"],
        "password": cfg["hs_password"],
        "endpoint": cfg["hs_endpoint"],
    }
    return h5pyd.Folder(domain, mode=mode, **login)
def getFolder(domain):
    """Open *domain* as a Folder, applying the pattern/query filters from ``cfg``.

    Reads credentials, endpoint, bucket, ``pattern`` and ``query`` from the
    module-level ``cfg`` and returns the resulting folder object.
    """
    options = {
        "username": cfg["hs_username"],
        "password": cfg["hs_password"],
        "endpoint": cfg["hs_endpoint"],
        "bucket": cfg["hs_bucket"],
        "pattern": cfg["pattern"],
        "query": cfg["query"],
        # use a smaller batch size for interactive listing of large collections
        "batch_size": 100,
    }
    return h5py.Folder(domain, **options)
def rm_objects(self):
    """Delete every entry found in the folder ``self.temp_dir``.

    Does nothing when ``username``, ``password``, ``endpoint`` or
    ``temp_dir`` is missing on the instance.
    """
    if not self.username or not self.password or not self.endpoint:
        return
    if not self.temp_dir:
        return

    folder = h5pyd.Folder(self.temp_dir, mode='a', endpoint=self.endpoint,
                          username=self.username, password=self.password)
    try:
        if len(folder) == 0:
            time.sleep(10)  # allow time for HSDS to sync
        # Snapshot the names first so entries are not deleted from the
        # folder while it is being iterated.
        for name in list(folder):
            del folder[name]
    finally:
        folder.close()
def getHomeFolder():
    """Return the path of the folder under '/home/' owned by the configured user.

    Credentials and endpoint come from the module-level ``cfg``. Only folders
    whose name is a prefix of the username are inspected (e.g. folder
    "/home/bob/" for a username such as "bob@example.com").

    Returns
    -------
    str or None
        '/home/<name>/' of the first matching folder whose ``owner`` equals
        the username; None when the username is empty or nothing matches.
    """
    username = cfg["hs_username"]
    password = cfg["hs_password"]
    endpoint = cfg["hs_endpoint"]
    if not username:
        return None

    homefolder = None
    # folder object for the root of all home folders
    home_root = h5pyd.Folder('/home/', username=username, password=password,
                             endpoint=endpoint)
    try:
        for name in home_root:
            if not username.startswith(name):
                continue
            path = '/home/' + name + '/'
            try:
                candidate = h5pyd.Folder(path, username=username,
                                         password=password, endpoint=endpoint)
            except IOError as ioe:
                logging.info("find home folder - got ioe: {}".format(ioe))
                continue
            except Exception as e:
                # logging.warn is deprecated (removed in Python 3.13)
                logging.warning("find home folder - got exception: {}".format(e))
                continue
            try:
                if candidate.owner == username:
                    homefolder = path
            finally:
                candidate.close()
            if homefolder:
                break
    finally:
        # close the root handle even if iteration or an owner lookup fails
        home_root.close()
    return homefolder
def _get_hsds_file_paths(h5_path, hsds_kwargs=None): """ Get a list of h5 filepaths matching the h5_path specification from HSDS Parameters ---------- h5_path : str Unix shell style pattern path with * wildcards to multi-file resource file sets. Files must have the same coordinates but can have different datasets or time indexes. hsds_kwargs : dict, optional Dictionary of optional kwargs for h5pyd, e.g., bucket, username, password, by default None Returns ------- file_paths : list List of filepaths for this handler to handle. """ import h5pyd if hsds_kwargs is None: hsds_kwargs = {} hsds_dir = os.path.dirname(h5_path) fn = os.path.basename(h5_path) if '*' in hsds_dir: msg = ('HSDS path specifications cannot handle wildcards in the ' 'directory name! The directory must be explicit but the ' 'filename can have wildcards. This HSDS h5_path input ' 'cannot be used: {}'.format(h5_path)) raise FileNotFoundError(msg) if not fn: msg = ('h5_path must be a unix shell style pattern with ' 'wildcard * in order to find files, but received ' 'directory specification: {}'.format(h5_path)) raise FileInputError(msg) with h5pyd.Folder(hsds_dir + '/', **hsds_kwargs) as f: file_paths = [ f'{hsds_dir}/{fn}' for fn in f if fnmatch(f'{hsds_dir}/{fn}', h5_path) ] return file_paths
def check_res_file(res_file):
    """
    Check resource to see if the given path
    - It belongs to a multi-file handler
    - Is on local disk
    - Is a hsds path

    Parameters
    ----------
    res_file : str
        Filepath to single resource file, multi-h5 directory,
        or /h5_dir/prefix*suffix

    Returns
    -------
    multi_h5_res : bool
        Boolean flag to use a MultiFileResource handler
    hsds : bool
        Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
        behind HSDS

    Raises
    ------
    FileNotFoundError
        If res_file is not a local file/directory/pattern and cannot be
        found through HSDS. The underlying error is attached as the cause.
    """
    multi_h5_res = False
    hsds = False

    if os.path.isdir(res_file) or ('*' in res_file):
        multi_h5_res = True
    elif not os.path.isfile(res_file):
        # Not on local disk: try to resolve the path through HSDS.
        try:
            import h5pyd

            hsds_dir, hsds_file = os.path.split(res_file)
            with h5pyd.Folder(hsds_dir + '/') as f:
                hsds = True
                if hsds_file not in f:
                    msg = ('{} is not a valid HSDS file path!'.format(
                        res_file))
                    print(msg)
                    raise FileNotFoundError(msg)
        except Exception as ex:
            msg = ("{} is not a valid file path, and HSDS "
                   "cannot be check for a file at this path:{}!".format(
                       res_file, ex))
            # chain explicitly so the original failure stays visible
            raise FileNotFoundError(msg) from ex

    return multi_h5_res, hsds
def __init__(self, loc, **kwargs):
    """Collect the FIREfly domains under a folder that pass the given filters.

    Parameters
    ----------
    loc : str
        A Kita server URI where FIREfly flight data are hosted.

    Other parameters
    ----------------
    mode : str
        Access mode used to open the folder. Default is "r".
    pattern : str
        A Python regex for filtering FIREfly flight file names.
    query : str
        A boolean expression for filtering FIREfly flights' data. When it
        is not given, the filters are produced by ``filter_builder`` from
        the remaining keyword arguments.
    kwargs : dict
        Any remaining named arguments. They are handed to
        ``filter_builder`` (when no query is given) and forwarded to
        ``h5pyd.Folder``.
    """
    self._mode = kwargs.pop('mode', 'r')
    name_pattern = kwargs.pop('pattern', None)
    user_query = kwargs.pop('query', None)

    if user_query is not None:
        # An explicit query is used as-is; there is no separate data filter.
        self._flight_filter = user_query
        self._data_filter = None
    else:
        self._flight_filter, self._data_filter = filter_builder(kwargs)

    self._loc = h5pyd.Folder(loc, mode=self._mode, pattern=name_pattern,
                             query=self._flight_filter, **kwargs)

    # Full domain paths: the folder's own domain plus each listed name.
    base = self._loc.domain
    self._domains = [base + child for child in self._loc]
    self._kwargs = kwargs
def main():
    """Remove every domain listed in an HSDS folder.

    Command line (read from ``sys.argv``): ``[--bucket <name>] <folder_path>``.
    ``folder_path`` must start and end with '/'. With no arguments, ``-h`` or
    ``--help``, or an invalid folder path, usage is printed and the process
    exits with status 1.
    """
    if len(sys.argv) == 1 or sys.argv[1] in ("-h", "--help"):
        printUsage()
        sys.exit(1)

    bucket = None
    if len(sys.argv) == 4 and sys.argv[1] == "--bucket":
        bucket = sys.argv[2]
        print("using bucket:", bucket)

    folder_path = sys.argv[-1]
    # startswith/endswith also cope with an empty argument
    if not (folder_path.startswith('/') and folder_path.endswith('/')):
        print("domain folder must start and end with '/")
        printUsage()
        # do not go on to open (and purge) an invalid path
        sys.exit(1)

    # pass the bucket through; previously --bucket was parsed but ignored
    folder = h5pyd.Folder(folder_path, mode='r+', bucket=bucket)
    try:
        # snapshot the names before deleting anything
        domains = list(folder)
        for domain in domains:
            print(f"removing: {domain}")
            del folder[domain]
    finally:
        folder.close()
# Utilities.  The full HDF5 REST Server copyright notice, including          #
# terms governing use, modification, and redistribution, is contained in    #
# the file COPYING, which can be found at the root of the source code       #
# distribution tree.  If you do not have access to this file, you may       #
# request a copy from help@hdfgroup.org.                                     #
##############################################################################
import sys
import os
import config
import h5pyd

# Pre-flight checks: each failed check ends the process via sys.exit with
# the message shown.

if config.get("use_h5py"):
    sys.exit("use_h5py")

folder_path = os.environ.get("H5PYD_TEST_FOLDER")
if folder_path is None:
    sys.exit("set H5PYD_TEST_FOLDER environment not set")

if folder_path[:1] != "/":
    # HSDS expects folder paths to start with a slash (as opposed to DNS format)
    sys.exit("not HSDS path")

try:
    h5pyd.Folder(folder_path)  # will trigger error with h5serv
except Exception:
    sys.exit("Server doesn't support Folder objects")
# Make sure the "BasicFusionFiles" domain exists before using it.
existing_domains = sdb.list_domains()['DomainNames']
if 'BasicFusionFiles' not in existing_domains:
    sdb.create_domain(DomainName="BasicFusionFiles")

# --list mode: print the stored items for one MISR path, then stop.
if args.list:
    q = sdb.select(
        SelectExpression='select * from BasicFusionFiles where MISR_Path="P125"'
    )
    if 'Items' in q:
        for item in q['Items']:
            print(item)
    sys.exit(0)

# Walk <domain>/<path>/ folders and print the date encoded in each file name.
misr_paths = h5py.Folder(domain_name=args.domain,
                         endpoint=args.hsds_endpoint,
                         username=args.user,
                         password=args.password)
for path in misr_paths:
    h5_files = h5py.Folder(domain_name=args.domain + path + "/",
                           endpoint=args.hsds_endpoint,
                           username=args.user,
                           password=args.password)
    for h5_file in h5_files:
        # name without extension; it must split into exactly seven
        # '_'-separated fields, the fifth being the timestamp
        root_name = h5_file.partition('.')[0]
        satellite, dataset, level, orbit, timestamp, _, _ = root_name.split("_")
        year, month, day = timestamp[:4], timestamp[4:6], timestamp[6:8]
        print(year, month, day)
import sys
import subprocess
import h5pyd

# Removes every domain found in H5_FOLDER by running the ``hsrm`` command
# once per domain; stops at the first failure.

HSDS_BUCKET = "firefly-hsds"
H5_FOLDER = "/FIREfly/h5/"

folder = h5pyd.Folder(H5_FOLDER, bucket=HSDS_BUCKET)  # get folder object

# snapshot the names before anything is deleted
domain_names = list(folder)

if not domain_names:
    print("no domains found!")
    sys.exit()

print(f"found {len(domain_names)} domains")

for domain_name in domain_names:
    domain = H5_FOLDER + domain_name
    print(f"removing {domain}")
    rc = subprocess.run(["hsrm", "--bucket", HSDS_BUCKET, domain])
    # any non-zero code is a failure; a negative value means the child was
    # terminated by a signal, which "> 0" would have missed
    if rc.returncode != 0:
        print(f"unable to delete {domain}")
        sys.exit(-1)

print("done!")
def getDomainInfo(domain, cfg):
    """Print summary information about a domain or folder.

    Parameters
    ----------
    domain : str
        Path to inspect. A trailing '/' selects a folder (opened with
        ``h5pyd.Folder``); anything else is opened with ``h5pyd.File``.
    cfg : mapping
        Must provide ``hs_username``, ``hs_password``, ``hs_endpoint`` and
        ``hs_bucket``. A true ``rescan`` entry opens the domain in "r+" mode
        and calls ``run_scan()`` on it before the statistics are printed.

    Notes
    -----
    Calls ``sys.exit`` with a message when the open fails with an IOError
    (404/410 not found, 401 unauthorized, 403 forbidden, anything else
    reported as unexpected).
    """
    username = cfg["hs_username"]
    password = cfg["hs_password"]
    endpoint = cfg["hs_endpoint"]
    bucket = cfg["hs_bucket"]

    rescan = "rescan" in cfg and cfg["rescan"]
    mode = "r+" if rescan else 'r'  # a rescan needs write intent
    is_folder = domain.endswith('/')

    try:
        if is_folder:
            f = h5pyd.Folder(domain, mode=mode, endpoint=endpoint,
                             username=username, password=password,
                             bucket=bucket, use_cache=True)
        else:
            f = h5pyd.File(domain, mode=mode, endpoint=endpoint,
                           username=username, password=password,
                           bucket=bucket, use_cache=False)
    except IOError as oe:
        if oe.errno in (404, 410):  # Not Found
            sys.exit("domain: {} not found".format(domain))
        elif oe.errno == 401:  # Unauthorized
            sys.exit("Authorization failure")
        elif oe.errno == 403:  # Forbidden
            sys.exit("Not allowed")
        else:
            sys.exit("Unexpected error: {}".format(oe))

    # Close the handle even if reading an attribute or printing fails.
    try:
        timestamp = datetime.fromtimestamp(int(f.modified))
        if not is_folder and f.last_scan:
            last_scan = datetime.fromtimestamp(int(f.last_scan))
        else:
            last_scan = None

        if is_folder:
            print("folder: {}".format(domain))
            print(" owner: {}".format(f.owner))
            print(" last modified: {}".format(timestamp))
        else:
            if rescan:
                f.run_scan()

            # report HDF objects (groups, datasets, and named datatypes)
            # vs. allocated chunks
            num_objects = f.num_groups + f.num_datatypes + f.num_datasets
            if f.num_chunks > 0:
                num_chunks = f.num_chunks
            else:
                # older storeinfo format doesn't have num_chunks, so calculate
                num_chunks = f.num_objects - num_objects

            print("domain: {}".format(domain))
            print(" owner: {}".format(f.owner))
            print(" id: {}".format(f.id.id))
            print(" last modified: {}".format(timestamp))
            if last_scan:
                print(" last scan: {}".format(last_scan))
            if f.md5_sum:
                print(" md5 sum: {}".format(f.md5_sum))
            print(" total_size: {}".format(format_size(f.total_size)))
            print(" allocated_bytes: {}".format(format_size(f.allocated_bytes)))
            if f.metadata_bytes:
                print(" metadata_bytes: {}".format(
                    format_size(f.metadata_bytes)))
            if f.linked_bytes:
                print(" linked_bytes: {}".format(format_size(
                    f.linked_bytes)))
            print(" num objects: {}".format(num_objects))
            print(" num chunks: {}".format(num_chunks))
            if f.num_linked_chunks:
                print(" linked chunks: {}".format(f.num_linked_chunks))
    finally:
        f.close()
# main(): command-line entry point that shows and updates ACLs of a domain/folder.
# Flow visible in the code below: parse sys.argv (options, the domain, "+flags" /
# "-flags" permission letters from 'crudep', then usernames); open the domain with
# h5pyd.File, or with h5pyd.Folder when the path ends in '/'; if permission flags
# were given, merge them into each user's ACL and write it with f.putACL(); finally
# print either all ACLs (no usernames given) or the ACL of each named user.
# NOTE(review): this definition is whitespace-collapsed and the username-handling
# branch contains masked text ("******"), so it is not valid Python as shown -
# restore that span from the upstream source before changing any code here.
# NOTE(review): several logging calls pass extra positional arguments without
# format placeholders (e.g. logging.debug("arg:", arg, "val:", val)) - presumably
# meant to be formatted; confirm.
# NOTE(review): logging.basicConfig() runs before the argument loop, so values
# parsed from --loglevel / --logfile look unused afterwards - confirm.
def main(): cfg["cmd"] = sys.argv[0].split('/')[-1] if cfg["cmd"].endswith(".py"): cfg["cmd"] = "python " + cfg["cmd"] cfg["verbose"] = False perm_abvr = {'c':'create', 'r': 'read', 'u': 'update', 'd': 'delete', 'e': 'readACL', 'p':'updateACL'} fields = ('username', 'create', 'read', 'update', 'delete', 'readACL', 'updateACL') domain = None perm = None loglevel = logging.ERROR logfname = None usernames = [] add_list = set() remove_list = set() if len(sys.argv) == 1 or sys.argv[1] == "-h": printUsage() # setup logging logging.basicConfig(filename=logfname, format='%(asctime)s %(message)s', level=loglevel) logging.debug("set log_level to {}".format(loglevel)) argn = 1 while argn < len(sys.argv): arg = sys.argv[argn] val = None if len(sys.argv) > argn + 1: val = sys.argv[argn+1] logging.debug("arg:", arg, "val:", val) if domain is None and arg in ("-v", "--verbose"): cfg["verbose"] = True argn += 1 elif domain is None and arg == "--loglevel": val = val.upper() if val == "DEBUG": loglevel = logging.DEBUG elif val == "INFO": loglevel = logging.INFO elif val in ("WARN", "WARNING"): loglevel = logging.WARNING elif val == "ERROR": loglevel = logging.ERROR else: printUsage() argn += 2 elif domain is None and arg == '--logfile': logfname = val argn += 2 elif domain is None and arg in ("-h", "--help"): printUsage() elif domain is None and arg in ("-e", "--endpoint"): cfg["hs_endpoint"] = val argn += 2 elif domain is None and arg in ("-u", "--username"): cfg["hs_username"] = val argn += 2 elif domain is None and arg in ("-p", "--password"): cfg["hs_password"] = val argn += 2 elif domain is None and arg[0] in ('-', '+'): print("No domain given") printUsage() elif domain is None: logging.debug("get domain") domain = arg if domain[0] != '/': print("Domain must start with '/'") printUsage() logging.debug("domain:", domain) argn += 1 elif arg[0] == '+': logging.debug("got plus") if len(usernames) > 0: logging.debug("usernames:", usernames) printUsage() add_list = set(arg[1:]) 
logging.info("add_list:", add_list) argn += 1 elif arg[0] == '-': logging.debug("got minus") if len(usernames) > 0: printUsage() remove_list = set(arg[1:]) logging.info("remove_list:", remove_list) argn += 1 else: logging.info("got username:"******"Invalid username:"******"domain:", domain) logging.info("add_list:", add_list) logging.info("remove_list:", remove_list) logging.info("usernames:", usernames) if len(usernames) == 0 and (add_list or remove_list): print("At least one username must be given to add/remove permissions") printUsage() if domain is None: print("no domain specified") sys.exit(1) conflicts = list(add_list & remove_list) if len(conflicts) > 0: print("permission: ", conflicts[0], " permission flag set for both add and remove") sys.exit(1) mode = 'r' if add_list or remove_list: mode = 'a' # we'll be updating the domain perm = {} for x in add_list: if x not in perm_abvr: print("Permission flag: {} is not valid - must be one of 'crudep;".format(x)) sys.exit(1) perm_name = perm_abvr[x] perm[perm_name] = True for x in remove_list: if x not in perm_abvr: print("Permission flag: {} is not valid - must be one of 'crudep;".format(x)) sys.exit(1) perm_name = perm_abvr[x] perm[perm_name] = False logging.info("perm:", perm) # open the domain or folder try: if domain[-1] == '/': f = h5pyd.Folder(domain, mode=mode, endpoint=cfg["hs_endpoint"], username=cfg["hs_username"], password=cfg["hs_password"]) else: f = h5pyd.File(domain, mode=mode, endpoint=cfg["hs_endpoint"], username=cfg["hs_username"], password=cfg["hs_password"]) except IOError as ioe: if ioe.errno in (404, 410): print("domain not found") sys.exit(1) elif ioe.errno in (401, 403): print("access is not authorized") sys.exit(1) else: print("Unexpected error:", ioe) sys.exit(1) # update/add ACL if permission flags have been set if perm: default_acl = {'updateACL': False, 'delete': False, 'create': False, 'read': False, 'update': False, 'readACL': False, 'userName': '******' } # note: list.copy not 
supported in py2.7, copy by hand for now # update_names = usernames.copy() update_names = [] for username in usernames: update_names.append(username) if not update_names: update_names.append("default") for username in update_names: # get user's ACL if it exist acl = getACL(f, username=username) if acl is None: acl = default_acl.copy() acl["userName"] = username logging.info("updating acl to: {}".format(acl)) # mix in any permission changes for k in perm: acl[k] = perm[k] try: f.putACL(acl) except IOError as ioe: if ioe.errno in (401, 403): print("access is not authorized") else: print("Unexpected error:", ioe) sys.exit(1) # # read the acls # if len(usernames) == 0: # no usernames, dump all ACLs try: acls = f.getACLs() except IOError as ioe: if ioe.errno == 403: print("User {} does not have permission to read ACL for this domain".format(cfg["hs_username"])) sys.exit(1) elif ioe.errno == 401: print("username/password needs to be provided") sys.exit(1) else: print("Unexpected error: {}".format(ioe)) print("%015s %08s %08s %08s %08s %08s %08s " % fields) print("-"*80) for acl in acls: vals = (acl["userName"], acl["create"], acl["read"], acl["update"], acl["delete"], acl["readACL"], acl["updateACL"]) print("%015s %08s %08s %08s %08s %08s %08s " % vals) else: header_printed = False # don't print header until we have at least one ACL for username in usernames: try: acl = f.getACL(username) if not header_printed: print("%015s %08s %08s %08s %08s %08s %08s " % fields) print("-"*80) header_printed = True vals = (acl["userName"], acl["create"], acl["read"], acl["update"], acl["delete"], acl["readACL"], acl["updateACL"]) print("%015s %08s %08s %08s %08s %08s %08s " % vals) except IOError as ioe: if ioe.errno == 403: print("User {} does not have permission to read ACL for this domain".format(cfg["hs_username"])) sys.exit(1) elif ioe.errno == 401: print("username/password needs to be provided") sys.exit(1) elif ioe.errno == 404: print(username, "<NONE>") else: print("Unexpected 
error:", ioe) sys.exit(1) f.close()
# Main # print("load_file.py") if "H5PYD_TEST_FOLDER" not in os.environ: print("set H5PYD_TEST_FOLDER environment not set") sys.exit(1) test_folder = os.environ["H5PYD_TEST_FOLDER"] data_dir = "data" s3_http_path = "https://s3.amazonaws.com/hdfgroup/data/hdf5test/" parent = h5pyd.Folder(test_folder) filenames = config.get_test_filenames() if not os.path.exists(data_dir): # make data directory for downloaded HDF5 files os.mkdir(data_dir) for filename in filenames: print(filename) domain_path = os.path.join(test_folder, filename) print(domain_path) if filename in parent: print("found") continue # check to see if the file has already been downloaded hdf5_path = os.path.join(data_dir, filename)