def merge_data_filenames(filelist, outputfile):
    print("The following input files were found:\n")
    for f in filelist:
        print("\t - %s" % f)
    data = OrderedDict()
    attrs = OrderedDict()
    for f in filelist:
        data[f], attrs[f] = hdf5.load(f)
    hdf5.save(outputfile, *merge_data(data, attrs))
    msg.info("Done")
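# Usage sketch for merge_data_filenames. Illustrative only: the file names
# below are hypothetical, and hdf5/msg/merge_data are the helpers defined in
# this module.
#
#   merge_data_filenames(["run1.h5", "run2.h5"], "merged.h5")
#
# Each input file is loaded once, the per-file datasets (and attributes) are
# combined in list order by merge_data, and the result is written to merged.h5.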
def get_data(filename, match, keys):
    """Load file, check if it contains match, update datasets based on
    command line options. Return data dictionary.

    Keyword arguments:
    filename -- input hdf5 file
    match -- common key used to order data
    keys -- user-chosen datasets to save
    """
    data = hdf5.load(filename)
    print("\nThe following datasets were found in %s:\n" % filename)
    msg.list_dataset(data)
    check.key_exists(match, data, filename)
    if keys:
        msg.info("Using only: " + keys)
        update_data(data, [k.strip() for k in keys.split(',')], match)
    return data
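# Usage sketch for get_data. The file name and dataset names are hypothetical;
# hdf5/msg/check/update_data are the helpers used above.
#
#   data = get_data("events.h5", match="event_id", keys="energy, position")
#
# This loads events.h5, verifies that the ordering key "event_id" exists, and
# trims the returned dictionary to the requested datasets (the match key is
# passed through to update_data so it can be kept for ordering).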
#!/usr/bin/env python
""" Print info on datasets in hdf5 file. """

import sys
sys.path.append('..')
import hdf5
import msg

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("usage: ./print file")
        sys.exit(1)
    print("\nThe following datasets were found in %s:\n" % sys.argv[1])
    msg.list_dataset(hdf5.load(sys.argv[1]))
""" f = open(filename, 'w') for fn in filelist: print >> f, os.path.abspath(fn) f.close() if __name__ == '__main__': msg.box("HDF5 MANIPULATOR: SPLIT") args = parser() data = hdf5.load(args.input) print "The following datasets were found in %s:\n" % args.input print "data=", data msg.list_dataset(data) filelist = generate_filelist( args.prefix or os.path.splitext(args.input)[0], check.get_size(data), int(args.size)) print "\nSaving output files woww:\n" print "iteritmes", filelist.iteritems() for f, r in filelist.iteritems(): print "i am inside split now" msg.list_fileinfo(f, r) print "r[0] r[1]", r[0], r[1] hdf5.save_subset(f, data, r[0], r[1])
        for key in data_list[f]:
            data[key] = np.append(data[key], data_list[f][key], axis=0)
    return data

if __name__ == '__main__':
    msg.box("HDF5 MANIPULATOR: MERGE")
    args = parser()
    filelist = get_filelist([f.strip() for f in args.input_files.split(',')])
    if not filelist:
        msg.error("No files matching --input were found.")
        sys.exit(1)
    print("The following input files were found:\n")
    for f in filelist:
        print("\t - %s" % f)
    data = OrderedDict()
    for f in filelist:
        data[f] = hdf5.load(f)
    hdf5.save(args.output, merge_data(data))
    msg.info("Done")
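# A minimal, self-contained sketch of the merge step above: datasets sharing a
# key are concatenated along axis 0, so entries from later files are appended
# after entries from earlier files. The arrays here are made up.
import numpy as np

file1 = {"energy": np.array([1.0, 2.0])}
file2 = {"energy": np.array([3.0])}

merged = dict(file1)
for key in file2:
    merged[key] = np.append(merged[key], file2[key], axis=0)

print(merged["energy"])  # -> [1. 2. 3.]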
def load_h5(filepath: Union[str, os.PathLike], name: str = '/',
            columns: Union[Sequence[str], 're.Pattern', Callable[..., Sequence[str]]] = '',
            format=None, fixblocks: bool = False, drop_short: bool = False,
            verbose=0, **kwargs) -> Union['Dataset', 'Struct']:
    """
    Load from h5 file and flip hdf5.io objects to riptable structures.

    In some h5 files, the arrays are saved as rows in "blocks". If `fixblocks`
    is ``True``, this routine will transpose the rows in the blocks.

    Parameters
    ----------
    filepath : str or os.PathLike
        The path to the HDF5 file to load.
    name : str
        Set to table name, defaults to '/'.
    columns : sequence of str or re.Pattern or callable, defaults to ''
        Return the given subset of columns, or those matching regex.
        If a function is passed, it will be called with column names, dtypes
        and shapes, and should return a subset of column names.
        Passing an empty string (the default) loads all columns.
    format : hdf5.Format
        TODO, defaults to hdf5.Format.NDARRAY
    fixblocks : bool
        True will transpose the rows when the H5 file stores the arrays as
        rows in blocks, defaults to False.
    drop_short : bool
        Set to True to drop short rows and never return a Struct, defaults to False.
    verbose
        TODO

    Returns
    -------
    Dataset or Struct
        A `Dataset` or `Struct` with all workspace contents.

    Notes
    -----
    block<#>_items is a list of column names (bytes)
    block<#>_values is a numpy array of numpy array (rows)
    columns (for riptable) can be generated by zipping names from the list
    with transposed columns
    axis0 appears to be all column names - not sure what to do with this
    also what is axis1? should it get added like the other columns?
    """
    import hdf5

    if format is None:
        format = hdf5.Format.NDARRAY

    if verbose > 0:
        print(f'starting h5 load {filepath}')

    # TEMP: Until hdf5.load() implements support for path-like objects, force conversion to str.
    filepath = os.fspath(filepath)
    ws = hdf5.load(filepath, name=name, columns=columns, format=format, **kwargs)

    if verbose > 0:
        print(f'finished h5 load {filepath}')

    if isinstance(ws, dict):
        if verbose > 0:
            print('h5 file loaded into dictionary. Possibly returning Dataset '
                  'from dictionary, otherwise Struct.')
        return _possibly_create_dataset(ws)

    ws = h5io_to_struct(ws)

    if fixblocks:
        ws = ws[0]
        final_dict = {}
        # Pair each block's column names with the transposed rows of its values.
        for k, v in ws.items():
            if k.endswith('_items'):
                names = v.astype('U')
                rows = ws[k[:-5] + 'values']
                t_dict = dict(zip(names, rows.transpose()))
                for t_k, t_v in t_dict.items():
                    final_dict[t_k] = t_v
        ws = TypeRegister.Struct(final_dict)

    if drop_short:
        # try to make a dataset
        rownum_set = {len(ws[c]) for c in ws}
        maxrow = max(rownum_set)
        # build a new dictionary with only columns of the max length
        final_dict = {}
        for k, v in ws.items():
            if len(v) == maxrow:
                final_dict[k] = v
            else:
                warnings.warn(
                    f"load_h5: drop_short, dropping col {k!r} with len {len(v)} vs {maxrow}"
                )
        ws = TypeRegister.Dataset(final_dict)

    return ws
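# Usage sketch for load_h5. The file path and column names are hypothetical;
# only keyword arguments documented above are used.
#
#   ds = load_h5('data/workspace.h5', name='/', fixblocks=True)
#   ds = load_h5('data/workspace.h5', columns=['price', 'size'], drop_short=True)
#
# With fixblocks=True, block<#>_items/block<#>_values pairs are rebuilt into
# named columns; with drop_short=True, columns shorter than the longest one
# are dropped so the result can be a Dataset rather than a Struct.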
Check if two hdf5 files are the same.
"""

import sys
import numpy as np

sys.path.append('..')
import hdf5
import msg
import check

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("usage: ./diff file1 file2")
        sys.exit(1)
    data1 = hdf5.load(sys.argv[1])
    data2 = hdf5.load(sys.argv[2])
    print("\nThe following datasets were found in %s:\n" % sys.argv[1])
    msg.list_dataset(data1)
    print("\nThe following datasets were found in %s:\n" % sys.argv[2])
    msg.list_dataset(data2)
    check.check_keys(data1, data2)
    if check.get_size(data1) != check.get_size(data2):
        msg.error("Different number of entries.")
        sys.exit(1)
    check.check_shapes(data1, data2)
""" f = open(filename, 'w') for fn in filelist: print >>f, os.path.abspath(fn) f.close() if __name__ == '__main__': msg.box("HDF5 MANIPULATOR: SPLIT") args = parser() data = hdf5.load(args.input) print "The following datasets were found in %s:\n" % args.input msg.list_dataset(data) filelist = generate_filelist( args.prefix or os.path.splitext(args.input)[0], check.get_size(data), int(args.size)) print "\nSaving output files:\n" for f, r in filelist.iteritems(): msg.list_fileinfo(f, r) hdf5.save_subset(f, data, r[0], r[1]) if args.filelist:
    if not len(data):
        msg.error("No datasets to process.")
        sys.exit(1)
    check.get_size(data)
    for key in keys:
        if key not in data:
            msg.warning("%s requested, but not found." % key)

if __name__ == '__main__':
    msg.box("HDF5 MANIPULATOR: EXTRACT")
    args = parser()
    data = hdf5.load(args.input, gkey=args.keys.split(','))
    print("The following datasets were found in %s:\n" % args.input)
    msg.list_dataset(data)
    # update_data(data, [k.strip() for k in args.keys.split(',')])
    print("\nThe following datasets will be saved in %s:\n" % args.output)
    msg.list_dataset(data)
    hdf5.save(args.output, data)
    msg.info("Done")
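# Usage sketch for the EXTRACT script above. The file names and dataset keys
# are hypothetical, and the flag spellings are assumed from the args fields
# referenced above (input, output, keys):
#
#   ./extract --input all.h5 --output subset.h5 --keys "energy,position"
#
# Only the requested datasets are loaded (via gkey) and written to the output.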