def __init__(self, path, mode='a', invalid_netcdf=None, **kwargs):
    """Open ``path`` with h5py or h5pyd and initialise the root group state.

    Parameters
    ----------
    path : str or file-like object
        Strings starting with ``http://``, ``https://`` or ``hdf5://`` are
        opened with h5pyd; any other string is opened with h5py. Non-string
        values are handed to h5py as file-like objects.
    mode : str, optional
        Mode forwarded to the underlying ``File`` constructor.
    invalid_netcdf : bool or None, optional
        Stored on the instance. Writing ``_NCProperties`` is disabled up
        front only when this is exactly ``True``.
    **kwargs
        Forwarded to ``h5py.File`` / ``h5pyd.File``.

    Raises
    ------
    ImportError
        If a URL is given while ``no_h5pyd`` is set.
    TypeError
        If a file-like object is given and h5py is older than 2.9.0.
    """
    try:
        if isinstance(path, str):
            if path.startswith(('http://', 'https://', 'hdf5://')):
                if no_h5pyd:
                    raise ImportError(
                        "No module named 'h5pyd'. h5pyd is required for "
                        "opening urls: {}".format(path))
                # Probe for an existing domain with the same kwargs
                # (endpoint, credentials, ...) as the real open; without
                # them the probe can fail and misreport the domain as new.
                try:
                    with h5pyd.File(path, 'r', **kwargs) as f:  # noqa
                        pass
                    self._preexisting_file = True
                except IOError:
                    self._preexisting_file = False
                self._h5file = h5pyd.File(path, mode, **kwargs)
            else:
                self._preexisting_file = os.path.exists(path)
                self._h5file = h5py.File(path, mode, **kwargs)
        else:  # file-like object
            if h5py.__version__ < LooseVersion('2.9.0'):
                raise TypeError(
                    "h5py version ({}) must be greater than 2.9.0 to load "
                    "file-like objects.".format(h5py.__version__))
            self._preexisting_file = mode in {'r', 'r+', 'a'}
            self._h5file = h5py.File(path, mode, **kwargs)
    except Exception:
        # Mark the object closed before propagating the failure.
        self._closed = True
        raise
    else:
        self._closed = False

    self._mode = mode
    self._root = self
    self._h5path = '/'
    self.invalid_netcdf = invalid_netcdf
    # If invalid_netcdf is None, we'll disable writing _NCProperties only
    # if we actually use invalid NetCDF features.
    self._write_ncproperties = invalid_netcdf is not True

    # These maps keep track of dimensions in terms of size (might be
    # unlimited), current size (identical to size for limited dimensions),
    # their position, and look-up for HDF5 datasets corresponding to a
    # dimension.
    self._dim_sizes = ChainMap()
    self._current_dim_sizes = ChainMap()
    self._dim_order = ChainMap()
    self._all_h5groups = ChainMap(self._h5group)
    super(File, self).__init__(self, self._h5path)
def filter(self, cond):
    """Filter flight segment data into new segments.

    Parameters
    ----------
    cond : str
        Condition (expression) for filtering flight segment data.

    Returns
    -------
    list of firefly.FlightSegment
        New flight segments holding the rows that matched the filtering
        condition, one per run of consecutive rows. Empty when no row
        matched.
    """
    def _make_segment(flight_data):
        # Build a sibling segment without running __init__; each segment
        # gets its own handle on the same domain.
        seg = self.__new__(type(self))
        seg._domain = h5pyd.File(self._domain.filename,
                                 self._domain.mode,
                                 **self._other)
        seg._other = self._other
        seg._flight = flight_data
        seg._bbox = None
        return seg

    # Filter the data...
    data = self._flight.query(cond, inplace=False)

    # Separate filtered data into continuous segments...
    row_idx = np.where(np.isin(self._flight.index, data.index))[0]
    if row_idx.size == 0:
        # Nothing matched: there is no first row to seed the diff with.
        return []
    seg_start = np.nonzero(np.diff(row_idx, prepend=row_idx[0]) > 1)[0]

    segments = []
    from_idx = 0
    for start in seg_start.tolist():
        segments.append(_make_segment(data.iloc[from_idx:start]))
        from_idx = start
    # Trailing run; this is the only segment when there are no gaps.
    segments.append(_make_segment(data.iloc[from_idx:]))
    return segments
def test_wrapper():
    '''
    This function tests the functionality of the wrapper module.

    It checks the chart returned by the default branch and then the list
    returned by the residential branch against fixed expected values.
    '''
    expected = [149129.03, 307.0, 53454.93, 12144.06, 8.13, 4353.01]

    # Keep the remote file open only for the duration of the test.
    with h5pyd.File('/nrel/wtk-us.h5', 'r') as wtk:
        test_gov = wrapper.wrapper(wtk, 'Salem', 'OR',
                                   land_available=20, goal=80)
        assert isinstance(test_gov, alt.vegalite.v3.api.LayerChart), \
            'Test result type (%s) is not of type (%s).' \
            % (type(test_gov), 'alt.vegalite.v3.api.LayerChart')

        # now to test the residential branch
        test_res = wrapper.wrapper(wtk, 'Salem', 'OR', 100000, goal=100,
                                   residential=True, household_size=8)
        assert isinstance(test_res, list), \
            'Test result type (%s) is not of type list/array.' \
            % str(type(test_res))
        assert len(test_res) == len(expected), \
            'Test result length (%s) is not the expected length (%s).' \
            % (len(test_res), len(expected))
        for actual, wanted in zip(test_res, expected):
            assert actual == wanted, \
                'Entry in test result list (%s) is not equal to expected ' \
                'result (%s).' % (actual, wanted)
def open_file(self, mode="r"):
    """Open the domain derived from ``self.path`` read-only and return it.

    The ``/``-separated components of ``self.path`` are reversed, joined
    with dots and suffixed with ``hdfgroup.org`` to form the domain name,
    which is printed before opening. ``"http://slowpoke2:5000"`` is passed
    as the third positional argument (presumably the endpoint).

    NOTE(review): ``mode`` is accepted but not used; the file is always
    opened with ``"r"``.
    """
    components = self.path.split("/")
    components.reverse()
    components.append("hdfgroup.org")
    domain_name = ".".join(components)
    print(domain_name)
    return h5pyd.File(domain_name, "r", "http://slowpoke2:5000")
def create_indexer(cls, domain, endpoint, top_level_path):
    """
    Return ``cls(domain, endpoint, top_level_path)`` if the path exists.

    The domain is opened read-only just long enough to test whether
    ``top_level_path`` is present; ``None`` is returned when it is not.
    """
    with h5pyd.File(domain, 'r', endpoint=endpoint) as hf:
        found = top_level_path in hf
    if not found:
        return None
    return cls(domain, endpoint, top_level_path)
def version(self):
    """Return the ``chianti_version`` attribute of the top-level group.

    ``None`` is returned when the attribute is absent.
    """
    with h5pyd.File(self.domain, 'r', endpoint=self.endpoint) as hf:
        if 'chianti_version' not in hf[self.top_level_path].attrs:
            return None
        return hf[self.top_level_path].attrs['chianti_version']
def __getitem__(self, key):
    """
    Look up ``key`` under the top-level path of the remote domain.

    Returns ``None`` when ``key`` is not contained in ``self``, a nested
    indexer from ``DataIndexerRemote.create_indexer`` when the entry is a
    group, and otherwise the dataset contents: a plain array when the
    ``unit`` attribute is ``'SKIP'`` or the dtype is ``object``, else a
    ``u.Quantity``. Data whose dtype string contains ``|S`` is cast to
    ``str``. Integer keys raise ``NotImplementedError``.

    NOTE: There seems to be a weird bug in h5pyd where, if a dataset is
    returned directly to a numpy array, the slicing/indexing fails. Hence
    the gymnastics around reading datasets and casting types explicitly.
    """
    if type(key) is int:
        raise NotImplementedError('Iteration not supported.')

    with h5pyd.File(self.domain, 'r', endpoint=self.endpoint) as hf:
        if key not in self:
            return None

        ds = hf[self.top_level_path][key]
        if isinstance(ds, h5pyd.Group):
            child_path = '/'.join([self.top_level_path, key])
            return DataIndexerRemote.create_indexer(
                self.domain, self.endpoint, child_path)

        # Scalars cannot be sliced
        data = ds[:] if ds.shape else np.array(ds.value)

        # Some things are just arrays
        unit = ds.attrs['unit']
        if unit == 'SKIP' or ds.dtype == 'object':
            data = data.astype(ds.dtype)
        else:
            data = u.Quantity(data, unit, dtype=ds.dtype)

        if '|S' in data.dtype.str:
            data = data.astype(str)
        return data
def getFile(domain):
    """Open ``domain`` read-only with the credentials and endpoint in ``cfg``.

    Reads ``hs_username``, ``hs_password`` and ``hs_endpoint`` from the
    module-level ``cfg`` and returns the opened file handle.
    """
    connection = {
        "username": cfg["hs_username"],
        "password": cfg["hs_password"],
        "endpoint": cfg["hs_endpoint"],
    }
    return h5py.File(domain, mode='r', **connection)
def open(filename):  # pylint:disable=redefined-builtin
    """
    Open a file as an `h5py`-like object.

    Format supported:

    - h5 files, if `h5py` module is installed
    - SPEC files exposed as a NeXus layout
    - raster files exposed as a NeXus layout (if `fabio` is installed)
    - fio files exposed as a NeXus layout
    - Numpy files ('npy' and 'npz' files)

    The filename can be followed by an HDF5 path using the separator `::`.
    In this case the object returned is a proxy to the target node,
    implementing the `close` function and supporting `with` context.

    The file is opened in read-only mode.

    :param str filename: A filename which can contain an HDF5 path by using
        the `::` separator.
    :raises: IOError if the file can't be loaded or path can't be found
    :rtype: h5py-like node
    """
    url = silx.io.url.DataUrl(filename)

    if url.scheme() in [None, "file", "silx"]:
        # That's a local file
        if not url.is_valid():
            raise IOError("URL '%s' is not valid" % filename)
        h5_file = _open_local_file(url.file_path())
    elif url.scheme() in ["fabio"]:
        raise IOError("URL '%s' containing fabio scheme is not supported" % filename)
    else:
        # That's maybe an URL supported by h5pyd
        uri = urllib.parse.urlparse(filename)
        if h5pyd is None:
            raise IOError("URL '%s' unsupported. Try to install h5pyd." % filename)
        path = uri.path
        endpoint = "%s://%s" % (uri.scheme, uri.netloc)
        if path.startswith("/"):
            path = path[1:]
        return h5pyd.File(path, 'r', endpoint=endpoint)

    # From here on a local file is open: release it on any failure so the
    # caller is never left with a handle it cannot reach.
    try:
        if url.data_slice():
            raise IOError("URL '%s' containing slicing is not supported" % filename)

        data_path = url.data_path()
        if data_path in [None, "/", ""]:
            # The full file is requested
            return h5_file

        # Only a child is requested
        if data_path not in h5_file:
            msg = "File '%s' does not contain path '%s'." % (filename, data_path)
            raise IOError(msg)
        node = h5_file[data_path]
        return _MainNode(node, h5_file)
    except Exception:
        h5_file.close()
        raise
def __init__(self, h5_file, unscale=True, hsds=False, str_decode=True, group=None):
    """
    Parameters
    ----------
    h5_file : str
        Path to .h5 resource file
    unscale : bool
        Boolean flag to automatically unscale variables on extraction
    hsds : bool
        Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
        behind HSDS
    str_decode : bool
        Boolean flag to decode the bytestring meta data into normal
        strings. Setting this to False will speed up the meta data read.
    group : str
        Group within .h5 resource file to open
    """
    self.h5_file = h5_file
    if not hsds:
        self._h5 = h5py.File(self.h5_file, 'r')
    else:
        # h5pyd is only imported when HSDS access is requested.
        import h5pyd
        self._h5 = h5pyd.File(self.h5_file, 'r')

    # Options stored from the arguments.
    self._group = group
    self._unscale = unscale
    self._str_decode = str_decode

    # Start out empty; nothing in this constructor fills them in.
    self._meta = None
    self._time_index = None
    self._i = 0
def getDomainInfo(domain, cfg):
    """Get info about the domain (or folder) and print it.

    ``domain`` is treated as a folder when it ends with ``/``. Credentials
    and endpoint are read from ``cfg`` (``hs_username``, ``hs_password``,
    ``hs_endpoint``). If opening fails with ``IOError`` the process exits
    via ``sys.exit`` with a message chosen from the error number.
    """
    username = cfg["hs_username"]
    password = cfg["hs_password"]
    endpoint = cfg["hs_endpoint"]

    # A trailing slash marks a folder rather than a domain.
    is_folder = domain.endswith('/')
    opener = h5pyd.Folder if is_folder else h5pyd.File

    try:
        f = opener(domain, mode='r', endpoint=endpoint, username=username,
                   password=password, use_cache=True)
    except IOError as oe:
        if oe.errno in (404, 410):  # Not Found
            sys.exit("domain: {} not found".format(domain))
        elif oe.errno == 401:  # Unauthorized
            sys.exit("Authorization failure")
        elif oe.errno == 403:  # Forbidden
            sys.exit("Not allowed")
        else:
            sys.exit("Unexpected error: {}".format(oe))

    # Close the handle even if reading a property or printing fails.
    try:
        timestamp = datetime.fromtimestamp(int(f.modified))

        if is_folder:
            print("folder: {}".format(domain))
            print("    owner:           {}".format(f.owner))
            print("    last modified:   {}".format(timestamp))
        else:
            # report HDF objects (groups, datasets, and named datatypes)
            # vs. allocated chunks
            num_objects = f.num_groups + f.num_datatypes + f.num_datasets
            num_chunks = f.num_objects - num_objects

            print("domain: {}".format(domain))
            print("    owner:           {}".format(f.owner))
            print("    id:              {}".format(f.id.id))
            print("    last modified:   {}".format(timestamp))
            print("    total_size:      {}".format(format_size(f.total_size)))
            print("    allocated_bytes: {}".format(format_size(f.allocated_bytes)))
            print("    num objects:     {}".format(num_objects))
            print("    num chunks:      {}".format(num_chunks))
    finally:
        f.close()
def getFile(domain):
    """Open ``domain`` read-only with caching, configured from ``cfg``.

    Reads ``hs_username``, ``hs_password``, ``hs_endpoint`` and
    ``hs_bucket`` from the module-level ``cfg`` and returns the opened
    file handle.
    """
    connection = {
        "username": cfg["hs_username"],
        "password": cfg["hs_password"],
        "endpoint": cfg["hs_endpoint"],
        "bucket": cfg["hs_bucket"],
    }
    return h5py.File(domain, mode='r', use_cache=True, **connection)
def __repr__(self): with h5pyd.File(self.domain, 'r', endpoint=self.endpoint) as hf: grp = hf[self.top_level_path] var_names = [key for key in grp] footer = '' if 'footer' not in grp.attrs else grp.attrs['footer'] name_strs = '\n'.join(var_names) version = '' if self.version is None else f'-- v{self.version}' return f"""{self.top_level_path} {version}
def _index_range(coords, lower, upper):
    """Return ``[start, stop]`` slice indices into ``coords`` for the bounds.

    ``start`` is the first index whose value is >= ``lower`` (``None`` if
    there is none); ``stop`` is the last index whose value is < ``upper``
    (index 0 if there is none). When both coincide ``stop`` is bumped by
    one so the slice selects at least one element.
    """
    index_range = [None, None]
    for i, value in enumerate(coords):
        if index_range[0] is None and value >= lower:
            index_range[0] = i
        if index_range[1] is None or value < upper:
            index_range[1] = i
    # make sure there is at least one element to select
    if index_range[1] == index_range[0]:
        index_range[1] = index_range[0] + 1
    return index_range


def get_region_avg(domain, bounds):
    """Average ``tasmin`` over a lat/lon box for every time slice.

    Parameters
    ----------
    domain : str
        Domain passed to ``h5pyd.File``; it must contain the datasets
        ``tasmin``, ``lat``, ``lon`` and ``time``.
    bounds : sequence
        ``(lat_lower, lat_upper, lon_lower, lon_upper)``.

    Returns
    -------
    tuple
        ``(time_index, avg_arr)`` where ``time_index`` is
        ``int(time[0] // 30)`` and ``avg_arr`` is a float32 array with one
        mean per time slice. Values >= 999.0 are excluded (presumably fill
        values); slices with no remaining value keep 0.
    """
    # The context manager closes the remote file on every exit path.
    with h5pyd.File(domain, "r") as f:
        tasmin_dset = f["tasmin"]

        # get start/end index for latitude bounds
        lat_index_range = _index_range(f["lat"][:], bounds[0], bounds[1])
        print("lat_index_range:", lat_index_range)

        # get start/end index for longitude bounds
        lon_index_range = _index_range(f["lon"][:], bounds[2], bounds[3])
        print("lon_index_range:", lon_index_range)

        # compute time index
        time_dset = f["time"]
        num_slices = time_dset.shape[0]
        time_index = int(time_dset[0] // 30)

        avg_arr = np.zeros((num_slices, ), dtype="f4")
        for i in range(num_slices):
            arr = np.asarray(tasmin_dset[i,
                                         lat_index_range[0]:lat_index_range[1],
                                         lon_index_range[0]:lon_index_range[1]])
            valid = arr[arr < 999.0]
            if valid.size > 0:
                # Accumulate in float64 before storing into the f4 output.
                avg_arr[i] = valid.mean(dtype="f8")

    return (time_index, avg_arr)
def open_array(self, mode="r"):
    """Yield the dataset ``self.key`` from the domain ``self.domain_name``.

    The domain is opened with ``self.host`` as the third positional
    argument and stays open while the consumer holds the yielded dataset.
    On ``IOError`` the domain name is printed and the error is re-raised.

    NOTE(review): ``mode`` is overridden; the domain is always opened
    with ``"r"``.
    """
    mode = "r"
    import h5pyd
    try:
        with h5pyd.File(self.domain_name, mode, self.host) as f:
            dataset = f[self.key]
            yield dataset
    except IOError:
        print(self.domain_name)
        # Bare raise keeps the original traceback.
        raise
def main():
    """Query the ``/dset`` table for one station and print the row counts."""
    filepath = "/home/john/sample/ghcn.h5"  # 82 GB
    station_id = "US1WAKG0020"  # seattle station
    query = "station == b'{}'".format(station_id)

    # The context manager closes the file even if the query raises.
    with h5py.File(filepath, 'r') as f:
        dset = f["/dset"]
        print("nrows:", dset.shape[0])

        result = dset.read_where(query)
        print("result: {} rows".format(len(result)))
def createFile(domain):
    """Create ``domain`` (mode ``'x'``) using the settings in ``cfg``.

    Reads ``hs_username``, ``hs_password`` and ``hs_endpoint`` from the
    module-level ``cfg``; the owner is ``cfg["hs_owner"]`` when that key
    is present and ``None`` otherwise. Returns the opened file handle.
    """
    connection = {
        "username": cfg["hs_username"],
        "password": cfg["hs_password"],
        "endpoint": cfg["hs_endpoint"],
    }
    owner = cfg["hs_owner"] if "hs_owner" in cfg else None
    return h5py.File(domain, mode='x', owner=owner, **connection)
def __init__(self, h5_file, unscale=True, str_decode=True, group=None, mode='r', hsds=False, hsds_kwargs=None):
    """
    Parameters
    ----------
    h5_file : str
        Path to .h5 resource file
    unscale : bool, optional
        Boolean flag to automatically unscale variables on extraction,
        by default True
    str_decode : bool, optional
        Boolean flag to decode the bytestring meta data into normal
        strings. Setting this to False will speed up the meta data read,
        by default True
    group : str, optional
        Group within .h5 resource file to open, by default None
    mode : str, optional
        Mode to instantiate h5py.File instance, by default 'r'. Only 'r'
        is accepted together with ``hsds=True``.
    hsds : bool, optional
        Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
        behind HSDS, by default False
    hsds_kwargs : dict, optional
        Dictionary of optional kwargs for h5pyd, e.g., bucket, username,
        password, by default None

    Raises
    ------
    IOError
        If ``hsds`` is set with a mode other than 'r', or if the local
        file cannot be opened in the requested mode.
    """
    self.h5_file = h5_file

    if not hsds:
        try:
            self._h5 = h5py.File(self.h5_file, mode=mode)
        except Exception as e:
            msg = ('Could not open file in mode "{}": "{}"'
                   .format(mode, self.h5_file))
            raise IOError(msg) from e
    else:
        if mode != 'r':
            raise IOError('Cannot write to files accessed vias HSDS!')

        import h5pyd

        extra = {} if hsds_kwargs is None else hsds_kwargs
        self._h5 = h5pyd.File(self.h5_file, mode='r', use_cache=False,
                              **extra)

    # Options stored from the arguments.
    self._group = group
    self._unscale = unscale
    self._str_decode = str_decode

    # Start out empty; nothing in this constructor fills them in.
    self._meta = None
    self._time_index = None
    self._lat_lon = None
    self._attrs = None
    self._shapes = None
    self._chunks = None
    self._dtypes = None
    self._i = 0
def createFile(domain, linked_domain=None):
    """Create ``domain`` (mode ``'x'``) using the settings in ``cfg``.

    Reads ``hs_username``, ``hs_password``, ``hs_endpoint`` and
    ``hs_bucket`` from the module-level ``cfg``; the owner is
    ``cfg["hs_owner"]`` when that key is present and ``None`` otherwise.
    ``linked_domain`` is forwarded unchanged to ``h5pyd.File``. Returns
    the opened file handle.
    """
    connection = {
        "username": cfg["hs_username"],
        "password": cfg["hs_password"],
        "endpoint": cfg["hs_endpoint"],
        "bucket": cfg["hs_bucket"],
    }
    owner = cfg["hs_owner"] if "hs_owner" in cfg else None
    return h5pyd.File(domain, mode='x', owner=owner,
                      linked_domain=linked_domain, **connection)
def createFile(domain):
    """Create ``domain`` (mode ``'x'``) using the settings in ``cfg``.

    Reads ``hs_username``, ``hs_password`` and ``hs_endpoint`` from the
    module-level ``cfg`` and returns the opened file handle.
    """
    connection = {
        "username": cfg["hs_username"],
        "password": cfg["hs_password"],
        "endpoint": cfg["hs_endpoint"],
    }
    return h5py.File(domain, mode='x', **connection)
def getWindData(name, start_i, end_i, y, x):
    """Read ``hf[name][start_i:end_i, y, x]`` from ``/nrel/wtk-us.h5``.

    The read is attempted up to 20 times with a 0.2 s pause after each
    failure.

    Raises
    ------
    ValueError
        If every attempt fails.
    """
    max_tries = 20
    for _ in range(max_tries):
        try:
            with h5pyd.File("/nrel/wtk-us.h5", 'r') as hf:
                return hf[name][start_i:end_i, y, x]
        except Exception:
            # Best-effort retry. Catching Exception rather than using a
            # bare except lets KeyboardInterrupt/SystemExit stop the loop.
            time.sleep(0.2)
    log.error("Wind data download timed out")
    raise ValueError('Wind Dataset Timed Out')
def File(path, mode='a', **kwargs):
    """Open ``path`` with h5pyd.File for URLs and with h5py.File otherwise.

    A string starting with ``http://``, ``https://`` or ``hdf5://`` is
    treated as a URL. ``ImportError`` is raised when the package needed
    for the chosen branch is flagged as unavailable.
    """
    url_prefixes = ('http://', 'https://', 'hdf5://')
    is_url = isinstance(path, str) and path.startswith(url_prefixes)

    if is_url:
        if no_h5pyd:
            raise ImportError(
                'h5pyd package is required for: {}'.format(path))
        return h5pyd.File(path, mode, **kwargs)

    if no_h5py:
        raise ImportError('h5py package is required for {}'.format(path))
    return h5py.File(path, mode, **kwargs)
def main():
    """Query the ``/dset`` table for one station with debug logging on."""
    loglevel = logging.DEBUG  # Use logging.ERROR to hide log messages
    logging.basicConfig(format='%(asctime)s %(message)s', level=loglevel)

    filepath = "/home/hdf/sample/ghcn.h5"  # 82 GB
    station_id = "US1WAKG0020"  # seattle station
    query = "station == b'{}'".format(station_id)

    # The context manager closes the file even if the query raises.
    with h5py.File(filepath, 'r') as f:
        dset = f["/dset"]
        print("nrows:", dset.shape[0])

        result = dset.read_where(query)
        print("result: {} rows".format(len(result)))
def _query_timeseries(hour=None, x=None, y=None, dataset=None, max_hours=None):
    """
    Read values of ``dataset`` at grid point (x, y) for a bootstrapped
    set of hours from the module-level file handle ``f``.

    The hours are drawn with ``_bootstrap_normal_dist`` around ``hour``,
    de-duplicated, truncated to integers and restricted to
    ``0 <= h < max_hours``. If the read raises, ``f`` is closed, the
    function sleeps for 10 seconds, re-opens "/nrel/wtk-us.h5" into the
    global ``f`` and calls itself again with the same arguments.

    :param hour: centre of the bootstrap distribution (passed as ``mean``)
    :param x: second component of each ``(z, x, y)`` index tuple
    :param y: third component of each ``(z, x, y)`` index tuple
    :param dataset: key of the dataset looked up in ``f``
    :param max_hours: exclusive upper bound for the sampled hours
    :return: DataFrame with the sampled hours in column 0 and the values
        read from the dataset in column 1
    """
    global f

    if isinstance(hour, list):
        logger.debug(
            "Assuming an explicit list of hours from user-specified input")
        all_hours = hour
    else:
        logger.debug(
            "Building a sequence from user-specified single scalar value")
        # NOTE(review): all_hours is never read again; the call is kept
        # as-is because dropping it could change the draws made below.
        all_hours = _bootstrap_normal_dist(n_samples=N_BOOTSTRAP_REPLICATES,
                                           mean=hour,
                                           variance=30,
                                           fun=round)

    _kwargs = dict(
        mean=hour,
        variance=DOY_VARIANCE_PARAMETER,
        fun=round,
        n_samples=N_BOOTSTRAP_REPLICATES,
    )
    sampled = unique(_bootstrap_normal_dist(**_kwargs))
    bs_hourlies = array([int(h) for h in sampled])
    bs_hourlies = bs_hourlies[bs_hourlies >= 0]
    bs_hourlies = bs_hourlies[bs_hourlies < max_hours]

    logger.debug("Query: z=" + str(hour) + "; x=" + str(x) + "; y=" + str(y))

    ret = DataFrame()
    ret[0] = bs_hourlies
    try:
        index_tuples = [(z, x, y) for z in bs_hourlies]
        ret[1] = f[dataset][index_tuples]
    except Exception:
        logger.debug("Dropped our connection -- picking up where we left off")
        f.close()
        time.sleep(10)
        f = h5.File("/nrel/wtk-us.h5", "r", bucket="nrel-pds-hsds")
        return _query_timeseries(hour, x, y, dataset, max_hours)
    return ret
def __init__(self, hsds_path, preload=False):
    """
    Parameters
    ----------
    hsds_path
        Passed as the first argument to ``h5pyd.File``, which is opened
        with ``mode='r'``.
        NOTE(review): an earlier docstring described this as an
        ``h5pyd.File`` instance - confirm the intended type.
    preload : bool
        When true ``self.preload()`` is called; otherwise ``_time_index``,
        ``_meta`` and ``_tree`` are initialised to ``None``.
    """
    self._h5d = h5pyd.File(hsds_path, mode='r')
    if not preload:
        self._time_index = None
        self._meta = None
        self._tree = None
    else:
        self.preload()
def diff(filename):
    """Diff an HDF5 file in S3 against its HSDS target domain.

    The source is ``s3://{src_bucket}/{filename}`` and the target domain is
    ``tgt_folder + filename``; both are opened read-only and passed to
    ``diff_file``, whose result is returned. ``IOError`` from any step is
    logged and re-raised. All handles opened here are closed before
    returning or propagating an error.

    NOTE(review): assumes the value returned by ``diff_file`` stays usable
    after both files are closed - confirm.
    """
    verbose = True

    print(f"got filename: {filename} from {inventory_domain}")
    # got filename: data/hdf5test/snp500.h5 from /home/john/bucketloader/inventory.h5
    s3path = f"s3://{src_bucket}/{filename}"
    print(f"using s3path: {s3path}")
    tgt_path = tgt_folder + filename
    print(f"tgt_path: {tgt_path}")

    # Use the local HSDS endpoint when one is configured.
    endpoint = hsds_local if hsds_local else hsds_global

    print("running diff {} and {}".format(s3path, tgt_path))

    s3 = s3fs.S3FileSystem()
    s3file = None
    fin = None
    fout = None
    try:
        try:
            s3file = s3.open(s3path, "rb")
            fin = h5py.File(s3file, "r")
        except IOError as ioe:
            logging.error("Error opening s3path {}: {}".format(s3path, ioe))
            raise

        # open domain
        try:
            fout = h5pyd.File(tgt_path, 'r', endpoint=endpoint,
                              username=username, password=password,
                              bucket=tgt_bucket)
        except IOError as ioe:
            if ioe.errno == 404:
                logging.error("Domain: {} not found".format(tgt_path))
            elif ioe.errno == 403:
                logging.error("No write access to domain: {}".format(tgt_path))
            else:
                logging.error("Error creating file {}: {}".format(tgt_path, ioe))
            raise

        # do the actual diff
        try:
            return diff_file(fin, fout, verbose=verbose)
        except IOError as ioe:
            logging.error("load_file error: {}".format(ioe))
            raise
    finally:
        # Release whatever was opened, newest first.
        for handle in (fout, fin, s3file):
            if handle is not None:
                handle.close()
def test_annual_solar_mean():
    '''
    This function tests the annual_solar_mean function.
    It checks the type of the resulting value as well as the type of the
    location index input.
    '''
    lansing_idx = (1062, 1938)
    # Keep the remote file open only for the duration of the call.
    with h5pyd.File("/nrel/wtk-us.h5", 'r') as f:
        result = solar_handling.annual_solar_mean(f, lansing_idx)

    energy_type = np.float64
    assert isinstance(result, energy_type),\
        "Average energy type output is incorrect"
    assert isinstance(lansing_idx, tuple),\
        "Input location index is not a tuple"
def watch_bucket():
    """Add every key of ``CHAP10_BUCKET`` missing from the inventory table.

    Opens ``inventory_domain`` in append mode, looks each key up in the
    ``inventory`` table by filename and appends a ``(key, 0, 0)`` row for
    keys that are not found.
    """
    print("watch_bucket")
    # Opened for append: the context manager makes sure the domain is
    # closed once all rows are written, also when a lookup fails.
    with h5pyd.File(inventory_domain, "a", bucket=HSDS_BUCKET) as f:
        table = f["inventory"]
        for key in keys(CHAP10_BUCKET):
            condition = f"filename == b'{key}'"
            matches = table.read_where(condition, limit=1)
            if len(matches) == 0:
                print(f"not found, adding filename: {key}")
                row = (key, 0, 0)
                table.append([row, ])
            # otherwise the filename is already in the inventory
def prime_h5rest_source(urinm, vname, tms=None):
    '''Open a domain on an h5serv/HSDS REST service and check for a variable.

    ``urinm[0]`` is used as the endpoint and ``urinm[1]`` as the domain;
    ``urinm[2]`` only appears in the error message. If ``vname`` is not
    among the top-level keys, an error is logged and the process exits
    with status 1.

    Returns ``(hfd, fillv)``, or ``(hfd, fillv, dtms)`` when ``tms`` is
    not ``None``, where ``hfd`` is the open file (the caller is
    responsible for closing it), ``fillv`` is the variable's
    ``_FillValue`` attribute and ``dtms`` are the time steps converted
    with ``nums_2_date``.
    '''
    hfd = h5pyd.File(urinm[1], "r", endpoint=urinm[0])
    if vname not in hfd.keys():
        logging.error('ERROR : %s not available in %s'
                      % (vname, urinm[2] + '/' + urinm[1]))
        # Release the handle before exiting.
        hfd.close()
        sys.exit(1)

    fillv = hfd[vname].attrs['_FillValue']  # WARN, this is assumed...

    # Identity test: `!= None` would compare elementwise for array input.
    if tms is None:
        return hfd, fillv

    # assume time and the required units exists for this example, then
    # convert to "readable" datetimes
    dtms = nums_2_date(hfd['time'][:], hfd['time'].attrs['units'],
                       hfd['time'].attrs['calendar'])
    return hfd, fillv, dtms
def __init__(self, h5_file, hsds=False):
    """
    Parameters
    ----------
    h5_file : str
        .h5 file containing exclusion layers and techmap
    hsds : bool
        Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
        behind HSDS
    """
    self.h5_file = h5_file
    if not hsds:
        self._h5 = h5py.File(self.h5_file, 'r')
    else:
        # h5pyd is only imported when HSDS access is requested.
        import h5pyd
        self._h5 = h5pyd.File(self.h5_file, 'r')
    self._iarr = None