Example #1
import sys

import msg    # project helper module: console messages
import check  # project helper module: dataset sanity checks


def update_data(data, keys, skip=None):
    """Remove datasets that were not requested.

    Keyword arguments:
    data -- dictionary with data
    keys -- user-requested keys
    skip -- a key that is never deleted
    """

    # Iterate over a copy of the keys, since entries are deleted in the loop.
    loopkeys = list(data.keys())
    for key in loopkeys:
        if key == skip:
            continue
        if key not in keys:
            del data[key]

    if not len(data):
        msg.error("No datasets to process.")
        sys.exit(1)

    check.get_size(data)

    for key in keys:
        if key not in data.keys():
            msg.warning("%s requested, but not found." % key)
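A minimal usage sketch of the same pruning step in isolation, with plain dictionaries and without the msg/check dependencies (the helper name _filter is made up for this sketch):

import sys


def _filter(data, keys, skip=None):
    # Same pruning logic as update_data above, minus the project helpers.
    for key in list(data.keys()):
        if key != skip and key not in keys:
            del data[key]
    if not data:
        sys.exit("No datasets to process.")
    for key in keys:
        if key not in data:
            print("WARNING: %s requested, but not found." % key)


data = {"jets": [1, 2], "muons": [3, 4], "met": [5]}
_filter(data, keys=["jets", "tau"], skip="met")
print(sorted(data.keys()))  # ['jets', 'met'] -- 'tau' only triggers a warning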
Example #2
import sys

import msg
import check


def update_data(data, keys, skip=None):

    """Remove datasets that were not requested.

    Keyword arguments:
    data -- dictionary with data
    keys -- user-requested keys
    skip -- a key that is never deleted
    """

    # Take a snapshot of the keys: deleting entries while iterating over
    # data.keys() directly raises a RuntimeError on Python 3.
    for key in list(data.keys()):
        if key == skip:
            continue
        if key not in keys:
            del data[key]

    if not len(data):
        msg.error("No datasets to process.")
        sys.exit(1)

    check.get_size(data)

    for key in keys:
        if key not in data.keys():
            msg.warning("%s requested, but not found." % key)
Example #3
import numpy as np

import msg
import check


def merge_data(data_list, attrs_list):
    """Merge dictionaries with data.

    Keyword arguments:
    data_list -- dictionary mapping each input file to its data dictionary
    attrs_list -- dictionary mapping each input file to its attributes
    """

    data = None
    attrs = None

    for f in data_list:
        size = check.get_size(data_list[f])
        if data is None:
            print "\nThe following datasets were found in %s:\n" % f
            msg.list_dataset(data_list[f])
            data = data_list[f]
            attrs = attrs_list[f]
        else:
            print "\nAdding %(n)d entries from %(f)s" % {"n": size, "f": f}
            check.check_keys(data, data_list[f])
            check.check_shapes(data, data_list[f])
            for key in data_list[f]:
                data[key] = np.append(data[key], data_list[f][key], axis=0)
            attrs['n_events'] += attrs_list[f]['n_events']

    return data, attrs
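A sketch of the merge step with plain dictionaries of NumPy arrays, inlining the checks that check.check_keys and check.check_shapes are assumed to perform (_merge is an illustrative stand-in, not project code):

import numpy as np


def _merge(chunks):
    # chunks: list of {name: array} dicts with matching keys and shapes.
    merged = None
    for chunk in chunks:
        if merged is None:
            merged = dict(chunk)
            continue
        assert set(merged) == set(chunk), "datasets differ between files"
        for key in chunk:
            assert merged[key].shape[1:] == chunk[key].shape[1:], key
            merged[key] = np.append(merged[key], chunk[key], axis=0)
    return merged


a = {"x": np.zeros((2, 3)), "y": np.ones((2,))}
b = {"x": np.ones((4, 3)), "y": np.zeros((4,))}
print(_merge([a, b])["x"].shape)  # (6, 3)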
Example #4
import h5py

import check


def get_size(filelist):

    """Get total size of datasets; return size and ranges per file.

    Keyword arguments:
    filelist -- the list of input files
    """

    total_size = 0
    ranges = {}

    for f in filelist:
        with h5py.File(f, 'r') as data:
            size = check.get_size(data)
            ranges[f] = [total_size, total_size + size]
            total_size += size

    return total_size, ranges
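A runnable sketch of the same bookkeeping, assuming check.get_size returns the number of entries (the first-axis length) shared by every dataset in a file:

import h5py
import numpy as np


def _n_entries(h5file):
    # Assumed behaviour of check.get_size: first-axis length of any dataset.
    return len(h5file[list(h5file.keys())[0]])


sizes = {"a.h5": 3, "b.h5": 5}
for name, n in sizes.items():
    with h5py.File(name, "w") as f:
        f.create_dataset("x", data=np.zeros((n, 2)))

total, ranges = 0, {}
for name in sorted(sizes):
    with h5py.File(name, "r") as f:
        n = _n_entries(f)
    ranges[name] = [total, total + n]
    total += n

print(ranges)  # {'a.h5': [0, 3], 'b.h5': [3, 8]}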
Example #5
import os

import h5py

import check
import hdf5
import msg
from combine_big import load
from split import generate_filelist
from split import save_filelist

if __name__ == '__main__':

    msg.box("HDF5 MANIPULATOR: SPLIT")

    args = parser()  # parser() is this script's argument parser, defined elsewhere

    data = load(args.input)

    filelist = generate_filelist(
        args.prefix or os.path.splitext(args.input)[0],
        check.get_size(data), int(args.size))

    print "\nSaving output files:\n"

    for f, r in filelist.iteritems():
        msg.list_fileinfo(f, r)
        hdf5.save_subset_big(f, data, r[0], r[1])

    if args.filelist:
        save_filelist(args.filelist, filelist.keys())

    data.close()

    msg.info("Done")
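generate_filelist itself is not shown in these examples; a plausible reconstruction of its contract, splitting total_size entries into chunks of at most chunk_size and mapping hypothetical output names to [start, end) ranges:

def _filelist(prefix, total_size, chunk_size):
    # Hypothetical reconstruction: cut [0, total_size) into consecutive
    # chunks named prefix_0000.h5, prefix_0001.h5, ...
    out = {}
    start, i = 0, 0
    while start < total_size:
        end = min(start + chunk_size, total_size)
        out["%s_%04d.h5" % (prefix, i)] = (start, end)
        start, i = end, i + 1
    return out


fl = _filelist("run", 10, 4)
for name in sorted(fl):
    print("%s %s" % (name, fl[name]))
# run_0000.h5 (0, 4)
# run_0001.h5 (4, 8)
# run_0002.h5 (8, 10)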
Example #6
    # (tail of a preceding function in this script)
    f.close()


if __name__ == '__main__':

    msg.box("HDF5 MANIPULATOR: SPLIT")

    args = parser()
    data = hdf5.load(args.input)

    print "The following datasets were found in %s:\n" % args.input
    print "data=", data
    msg.list_dataset(data)
    filelist = generate_filelist(
        args.prefix or os.path.splitext(args.input)[0], check.get_size(data),
        int(args.size))

    print "\nSaving output files woww:\n"
    print "iteritmes", filelist.iteritems()
    for f, r in filelist.iteritems():
        print "i am inside split now"
        msg.list_fileinfo(f, r)
        print "r[0] r[1]", r[0], r[1]
        hdf5.save_subset(f, data, r[0], r[1])

    if args.filelist:
        save_filelist(args.filelist, filelist.keys())

    msg.info("Done")
Example #7
import sys

import h5py

import msg
import check

# usage() and get_fractions() are defined earlier in this script.

if __name__ == '__main__':
    if len(sys.argv) < 4:
        usage()

    train_frac, val_frac = get_fractions()

    if train_frac + val_frac > 1.0:
        msg.error("Total fraction must be <= 1.0")
        sys.exit(1)

    f = h5py.File(sys.argv[1], 'r+')

    print "\nThe following datasets were found in %s:\n" % sys.argv[1]
    msg.list_dataset(f)

    N = check.get_size(f)
    nof_train = int(train_frac * N)
    nof_val = int(val_frac * N)
    nof_test = N - nof_train - nof_val

    print "\nThe following split will be used:\n"
    print "\t - training: %d entries" % nof_train
    print "\t - validation: %d entries" % nof_val
    print "\t - testing: %d entries" % nof_test

    train_dict = {name: (0, nof_train)
                  for name in f.keys()}
    valid_dict = {name: (nof_train, nof_train + nof_val)
                  for name in f.keys()}
    test_dict = {name: (nof_train + nof_val, N)
                 for name in f.keys()}
Example #8
import sys

import h5py

import msg
import check

# usage() and get_fractions() are defined earlier in this script.

if __name__ == '__main__':
    if len(sys.argv) < 4:
        usage()

    train_frac, val_frac = get_fractions()

    if train_frac + val_frac > 1.0:
        msg.error("Total fraction must be <= 1.0")
        sys.exit(1)

    f = h5py.File(sys.argv[1], 'r+')

    print "\nThe following datasets were found in %s:\n" % sys.argv[1]
    msg.list_dataset(f)

    N = check.get_size(f)
    nof_train = int(train_frac * N)
    nof_val = int(val_frac * N)
    nof_test = N - nof_train - nof_val

    print "\nThe following split will be used:\n"
    print "\t - training: %d entries" % nof_train
    print "\t - validation: %d entries" % nof_val
    print "\t - testing: %d entries" % nof_test

    train_dict = {name: (0, nof_train) for name in f.keys()}
    valid_dict = {name: (nof_train, nof_train + nof_val) for name in f.keys()}
    test_dict = {name: (nof_train + nof_val, N) for name in f.keys()}

    split_dict = {'train': train_dict, 'valid': valid_dict, 'test': test_dict}
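The index arithmetic on its own, as a runnable sketch (the dataset names here are made up):

N = 100
train_frac, val_frac = 0.6, 0.2

nof_train = int(train_frac * N)
nof_val = int(val_frac * N)
nof_test = N - nof_train - nof_val  # the remainder goes to the test set

names = ["images", "labels"]
split_dict = {
    "train": {n: (0, nof_train) for n in names},
    "valid": {n: (nof_train, nof_train + nof_val) for n in names},
    "test": {n: (nof_train + nof_val, N) for n in names},
}
print(split_dict["valid"]["images"])  # (60, 80)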
Example #9
import sys

import numpy as np

import hdf5
import msg
import check

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("usage: ./diff file1 file2")
        sys.exit(1)

    data1 = hdf5.load(sys.argv[1])
    data2 = hdf5.load(sys.argv[2])

    print("\nThe following datasets were found in %s:\n" % sys.argv[1])
    msg.list_dataset(data1)
    print("\nThe following datasets were found in %s:\n" % sys.argv[2])
    msg.list_dataset(data2)

    check.check_keys(data1, data2)

    if check.get_size(data1) != check.get_size(data2):
        msg.error("Different number of entries.")
        sys.exit(1)

    check.check_shapes(data1, data2)

    for key in data1:
        if not np.equal(data1[key], data2[key]).all():
            msg.error("Different entries for dataset: %s" % key)
            sys.exit(1)

    msg.info("Files are the same.")
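The comparison logic on its own, with in-memory NumPy arrays standing in for the loaded HDF5 files (_same is an illustrative helper, not project code):

import numpy as np


def _same(d1, d2):
    # Same checks as the script: keys, then shapes, then values.
    if set(d1) != set(d2):
        return False
    for key in d1:
        if d1[key].shape != d2[key].shape:
            return False
        if not np.equal(d1[key], d2[key]).all():
            return False
    return True


a = {"x": np.arange(6).reshape(2, 3)}
b = {"x": np.arange(6).reshape(2, 3)}
c = {"x": np.zeros((2, 3))}
print("%s %s" % (_same(a, b), _same(a, c)))  # True False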