import sys

import msg
import check


def update_data(data, keys, skip=None):
    """Remove datasets that were not requested.

    Keyword arguments:
    data -- dictionary with data
    keys -- user-requested keys
    skip -- the key not to delete
    """
    loopkeys = list(data.keys())
    for key in loopkeys:
        if key == skip:
            continue
        if key not in keys:
            del data[key]

    if not len(data):
        msg.error("No datasets to process.")
        sys.exit(1)

    check.get_size(data)

    for key in keys:
        if key not in data.keys():
            msg.warning("%s requested, but not found." % key)
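# Usage sketch (not from the original sources; the sample arrays are
# hypothetical, and msg/check are assumed to behave as used above):
# update_data prunes the dictionary in place, keeping the requested keys
# plus the optional skipped key.
import numpy as np

sample = {'images': np.zeros((10, 4)), 'labels': np.zeros(10),
          'extras': np.zeros(10)}
update_data(sample, keys=['images'], skip='labels')
# sample now holds 'images' and 'labels'; 'extras' was deleted, and a
# warning would be printed for any requested key that is absent.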
import numpy as np

import msg
import check


def merge_data(data_list, attrs_list):
    """Merge dictionaries with data.

    Keyword arguments:
    data_list -- the dictionary with data dictionaries (one per file)
    attrs_list -- the dictionary with attribute dictionaries (one per file)
    """
    data = None
    attrs = None

    for f in data_list:
        size = check.get_size(data_list[f])
        if not data:
            print "\nThe following datasets were found in %s:\n" % f
            msg.list_dataset(data_list[f])
            data = data_list[f]
            attrs = attrs_list[f]
        else:
            print "\nAdding %(n)d entries from %(f)s" % {"n": size, "f": f}
            check.check_keys(data, data_list[f])
            check.check_shapes(data, data_list[f])
            for key in data_list[f]:
                data[key] = np.append(data[key], data_list[f][key], axis=0)
            attrs['n_events'] += attrs_list[f]['n_events']

    return data, attrs
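# Usage sketch (hypothetical file names and datasets; assumes check and msg
# behave as used above): entries are concatenated along axis 0 and the
# 'n_events' counters are summed.
demo_data = {'run1.h5': {'x': np.zeros((5, 3))},
             'run2.h5': {'x': np.ones((7, 3))}}
demo_attrs = {'run1.h5': {'n_events': 5}, 'run2.h5': {'n_events': 7}}
merged, merged_attrs = merge_data(demo_data, demo_attrs)
# merged['x'].shape == (12, 3) and merged_attrs['n_events'] == 12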
import h5py

import check


def get_size(filelist):
    """Get total size of datasets; return size and ranges per file.

    Keyword arguments:
    filelist -- the list of input files
    """
    total_size = 0
    ranges = {}

    for f in filelist:
        data = h5py.File(f, 'r')
        size = check.get_size(data)
        ranges[f] = [total_size, total_size + size]
        total_size = total_size + size
        data.close()

    return total_size, ranges
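# Self-contained sketch (hypothetical file and dataset names): build two tiny
# HDF5 files and check the cumulative [start, stop) ranges reported for them.
import h5py
import numpy as np

for name, n in [('part1.h5', 100), ('part2.h5', 50)]:
    tmp = h5py.File(name, 'w')
    tmp.create_dataset('x', data=np.zeros((n, 2)))
    tmp.close()

total, ranges = get_size(['part1.h5', 'part2.h5'])
# total == 150; ranges == {'part1.h5': [0, 100], 'part2.h5': [100, 150]}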
import os

import h5py

import msg
import check
import hdf5
from combine_big import load
from split import generate_filelist
from split import save_filelist

if __name__ == '__main__':

    msg.box("HDF5 MANIPULATOR: SPLIT")

    # parser() comes from the package's command-line handling
    args = parser()
    data = load(args.input)

    filelist = generate_filelist(
        args.prefix or os.path.splitext(args.input)[0],
        check.get_size(data), int(args.size))

    print "\nSaving output files:\n"

    for f, r in filelist.iteritems():
        msg.list_fileinfo(f, r)
        hdf5.save_subset_big(f, data, r[0], r[1])

    if args.filelist:
        save_filelist(args.filelist, filelist.keys())

    data.close()

    msg.info("Done")
    f.close()


if __name__ == '__main__':

    msg.box("HDF5 MANIPULATOR: SPLIT")

    args = parser()
    data = hdf5.load(args.input)

    print "\nThe following datasets were found in %s:\n" % args.input
    msg.list_dataset(data)

    filelist = generate_filelist(
        args.prefix or os.path.splitext(args.input)[0],
        check.get_size(data), int(args.size))

    print "\nSaving output files:\n"

    for f, r in filelist.iteritems():
        msg.list_fileinfo(f, r)
        hdf5.save_subset(f, data, r[0], r[1])

    if args.filelist:
        save_filelist(args.filelist, filelist.keys())

    msg.info("Done")
import sys

import h5py

import msg
import check

if __name__ == '__main__':

    # usage() and get_fractions() are defined elsewhere in this script
    if len(sys.argv) < 4:
        usage()

    train_frac, val_frac = get_fractions()

    if train_frac + val_frac > 1.0:
        msg.error("Total fraction must be <= 1.0")
        sys.exit(1)

    f = h5py.File(sys.argv[1], 'r+')

    print "\nThe following datasets were found in %s:\n" % sys.argv[1]
    msg.list_dataset(f)

    N = check.get_size(f)
    nof_train = int(train_frac * N)
    nof_val = int(val_frac * N)
    nof_test = N - nof_train - nof_val

    print "\nThe following split will be used:\n"
    print "\t - training: %d entries" % nof_train
    print "\t - validation: %d entries" % nof_val
    print "\t - testing: %d entries" % nof_test

    train_dict = {name: (0, nof_train) for name in f.keys()}
    valid_dict = {name: (nof_train, nof_train + nof_val)
                  for name in f.keys()}
    test_dict = {name: (nof_train + nof_val, N) for name in f.keys()}

    split_dict = {'train': train_dict, 'valid': valid_dict,
                  'test': test_dict}
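# Sketch (not the script's actual saving code): each per-split dictionary
# maps a dataset name to its (start, stop) range, so slicing the open file
# with those ranges yields the corresponding subset.
for part in ('train', 'valid', 'test'):
    for name, (start, stop) in split_dict[part].iteritems():
        subset = f[name][start:stop]  # numpy array holding this split's rows
        print "%s/%s: %d entries" % (part, name, stop - start)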
import sys

import numpy as np

import hdf5
import msg
import check

if __name__ == '__main__':

    if len(sys.argv) != 3:
        print("usage: ./diff file1 file2")
        sys.exit(1)

    data1 = hdf5.load(sys.argv[1])
    data2 = hdf5.load(sys.argv[2])

    print("\nThe following datasets were found in %s:\n" % sys.argv[1])
    msg.list_dataset(data1)

    print("\nThe following datasets were found in %s:\n" % sys.argv[2])
    msg.list_dataset(data2)

    check.check_keys(data1, data2)

    if check.get_size(data1) != check.get_size(data2):
        msg.error("Different number of entries.")
        sys.exit(1)

    check.check_shapes(data1, data2)

    for key in data1:
        if not np.equal(data1[key], data2[key]).all():
            msg.error("Different entries for dataset: %s" % key)
            sys.exit(1)

    msg.info("Files are the same.")
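# Minimal sketch of the comparison idea above (hypothetical in-memory data):
# np.equal(...).all() is True only when every element of both arrays matches.
a = {'x': np.arange(6).reshape(2, 3)}
b = {'x': np.arange(6).reshape(2, 3)}
same = all(np.equal(a[k], b[k]).all() for k in a)
# same is True; changing any single element of b['x'] flips it to False.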