from distutils.version import LooseVersion
from fnmatch import fnmatch

import pandas as pd


def get_keys_stops_divisions(path, key, stop, sorted_index, chunksize, mode="r"):
    """
    Get the "keys" or group identifiers which match the given key, which
    can contain wildcards. This uses the hdf file identified by the given
    path. Also get the index of the last row of data for each matched key.
    """
    with pd.HDFStore(path, mode=mode) as hdf:
        if LooseVersion(pd.__version__) >= LooseVersion("0.24"):
            keys = _expand_key(key, hdf)
        else:
            # TODO: remove if we require pandas >= 0.24
            keys = [k for k in hdf.keys() if fnmatch(k, key)]
        stops = []
        divisions = []
        for k in keys:
            storer = hdf.get_storer(k)
            # Fixed-format stores cannot be read in chunks; only the "table"
            # format is partitionable. dont_use_fixed_error_message is a
            # module-level message defined elsewhere in this file.
            if storer.format_type != "table":
                raise TypeError(dont_use_fixed_error_message)
            if stop is None:
                stops.append(storer.nrows)
            elif stop > storer.nrows:
                raise ValueError(
                    "Stop keyword exceeds dataset number of rows "
                    "({})".format(storer.nrows)
                )
            else:
                stops.append(stop)
            if sorted_index:
                # The index values at the start of each chunk, plus the final
                # index value, form the partition divisions for this key.
                division = [
                    storer.read_column("index", start=start, stop=start + 1)[0]
                    for start in range(0, storer.nrows, chunksize)
                ]
                division_end = storer.read_column(
                    "index", start=storer.nrows - 1, stop=storer.nrows
                )[0]
                division.append(division_end)
                divisions.append(division)
            else:
                divisions.append(None)
    return keys, stops, divisions
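# A minimal usage sketch (hedged: "example.h5" and "/data" are illustrative
# names, not from this module). For a 10-row table with a 0..9 integer index,
# read with chunksize=5 and sorted_index=True, the helper returns one entry
# per matched key:
#
#     keys, stops, divisions = get_keys_stops_divisions(
#         "example.h5", key="/data", stop=None, sorted_index=True, chunksize=5
#     )
#     # keys      == ["/data"]
#     # stops     == [10]          one row count per matched key
#     # divisions == [[0, 5, 9]]   index values at each chunk start, plus the
#     #                            final index value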
def _expand_key(key, hdf):
    """Expand a (possibly wildcarded) key into the list of matching keys."""
    import glob

    if not glob.has_magic(key):
        keys = [key]
    else:
        keys = [k for k in hdf.keys() if fnmatch(k, key)]
        # https://github.com/dask/dask/issues/5934
        # TODO: remove this part if/when pandas copes with all keys
        keys.extend(
            n._v_pathname
            for n in hdf._handle.walk_nodes("/", classname="Table")
            if fnmatch(n._v_pathname, key)
            and n._v_name != "table"
            and n._v_pathname not in keys
        )
    return keys
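
# A hedged, self-contained demo of both helpers. It assumes pandas with
# PyTables installed and writes a throwaway HDF5 file; "example.h5" and the
# "/data_*" group names are illustrative, not part of the original module.
if __name__ == "__main__":
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, "example.h5")
        # Two table-format groups so the wildcard pattern matches twice.
        pd.DataFrame({"x": range(10)}).to_hdf(path, key="/data_a", format="table")
        pd.DataFrame({"x": range(4)}).to_hdf(path, key="/data_b", format="table")

        with pd.HDFStore(path, mode="r") as hdf:
            print(_expand_key("/data_*", hdf))  # expected: ["/data_a", "/data_b"]

        keys, stops, divisions = get_keys_stops_divisions(
            path, key="/data_*", stop=None, sorted_index=True, chunksize=5
        )
        print(keys)       # expected: ["/data_a", "/data_b"]
        print(stops)      # expected: [10, 4]
        print(divisions)  # expected: [[0, 5, 9], [0, 3]]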