Example #1
def merge_data_filenames(filelist, outputfile):
    print "The following input files were found:\n"

    for f in filelist:
        print "\t - %s" % f

    data = OrderedDict()
    attrs = OrderedDict()

    for f in filelist:
        data[f], attrs[f] = hdf5.load(f)

    hdf5.save(outputfile, *merge_data(data, attrs))

    msg.info("Done")
Example #2
0
def get_data(filename, match, keys):
    """Load file, check if contains match,
    update datasets based on command line options. Return data dictionary.

    Keyword arguments:
    filename -- input hdf5 file
    match -- common key used to order data
    keys -- user-chosen datasets to save
    """

    data = hdf5.load(filename)

    print("\nThe following datasets were found in %s:\n" % filename)
    msg.list_dataset(data)

    check.key_exists(match, data, filename)

    if keys:
        msg.info("Using only: " + keys)
        update_data(data, [k.strip() for k in keys.split(',')], match)

    return data
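
A minimal usage sketch for get_data; the parser(), hdf5, and msg names are the project-local helpers used throughout these examples, and the argument names (args.input, args.match, args.keys, args.output) are assumptions made for illustration:

if __name__ == '__main__':
    # Hypothetical driver assembled from the pattern in the other examples.
    args = parser()

    # Load the file, verify the match key exists, and optionally keep only
    # the user-chosen datasets.
    data = get_data(args.input, args.match, args.keys)

    hdf5.save(args.output, data)
    msg.info("Done")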
Example #4
#!/usr/bin/env python
"""
Print info on datasets in hdf5 file.
"""
import sys
sys.path.append('..')
import hdf5
import msg

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("usage: ./print file")
        sys.exit(1)

    print("\nThe following datasets were found in %s:\n" % sys.argv[1])
    msg.list_dataset(hdf5.load(sys.argv[1]))
Example #5
    """

    with open(filename, 'w') as f:
        for fn in filelist:
            print(os.path.abspath(fn), file=f)


if __name__ == '__main__':

    msg.box("HDF5 MANIPULATOR: SPLIT")

    args = parser()
    data = hdf5.load(args.input)

    print "The following datasets were found in %s:\n" % args.input
    print "data=", data
    msg.list_dataset(data)
    filelist = generate_filelist(
        args.prefix or os.path.splitext(args.input)[0], check.get_size(data),
        int(args.size))

    print "\nSaving output files woww:\n"
    print "iteritmes", filelist.iteritems()
    for f, r in filelist.iteritems():
        print "i am inside split now"
        msg.list_fileinfo(f, r)
        print "r[0] r[1]", r[0], r[1]
        hdf5.save_subset(f, data, r[0], r[1])
Example #6
            for key in data_list[f]:
                data[key] = np.append(data[key], data_list[f][key], axis=0)

    return data


if __name__ == '__main__':

    print("HDF5 MANIPULATOR: MERGE")

    args = parser()

    filelist = get_filelist([f.strip() for f in args.input_files.split(',')])

    if not filelist:
        msg.error("No files matching --input were found.")
        sys.exit(1)

    print "The following input files were found:\n"

    for f in filelist:
        print "\t - %s" % f

    data = OrderedDict()

    for f in filelist:
        data[f] = hdf5.load(f)

    hdf5.save(args.output, merge_data(data))

    msg.info("Done")
Example #7
def load_h5(filepath: Union[str, os.PathLike],
            name: str = '/',
            columns: Union[Sequence[str], 're.Pattern',
                           Callable[..., Sequence[str]]] = '',
            format=None,
            fixblocks: bool = False,
            drop_short: bool = False,
            verbose=0,
            **kwargs) -> Union['Dataset', 'Struct']:
    """
    Load from h5 file and flip hdf5.io objects to riptable structures.

    In some h5 files, the arrays are saved as rows in "blocks". If `fixblocks` is ``True``,
    this routine will transpose the rows in the blocks.

    Parameters
    ----------
    filepath : str or os.PathLike
        The path to the HDF5 file to load.
    name : str
        Set to table name, defaults to '/'.
    columns : sequence of str or re.Pattern or callable, defaults to ''
        Return the given subset of columns, or those matching regex.
        If a function is passed, it will be called with column names, dtypes and shapes,
        and should return a subset of column names.
        Passing an empty string (the default) loads all columns.
    format : hdf5.Format
        TODO, defaults to hdf5.Format.NDARRAY
    fixblocks : bool
        Set to True to transpose the rows when the h5 file stores its arrays as rows in blocks, defaults to False.
    drop_short : bool
        Set to True to drop short rows and never return a Struct, defaults to False.
    verbose
        TODO

    Returns
    -------
    Dataset or Struct
        A `Dataset` or `Struct` with all workspace contents.

    Notes
    -----
    block<#>_items is a list of column names (bytes).
    block<#>_values is a numpy array of numpy arrays (rows).
    Columns (for riptable) can be generated by zipping the names with the transposed rows.

    axis0 appears to be all column names - not sure what to do with this.
    Also, what is axis1? Should it get added like the other columns?
    """
    import hdf5
    if format is None:
        format = hdf5.Format.NDARRAY

    if verbose > 0: print(f'starting h5 load {filepath}')
    # TEMP: Until hdf5.load() implements support for path-like objects, force conversion to str.
    filepath = os.fspath(filepath)
    ws = hdf5.load(filepath,
                   name=name,
                   columns=columns,
                   format=format,
                   **kwargs)
    if verbose > 0: print(f'finished h5 load {filepath}')

    if isinstance(ws, dict):
        if verbose > 0:
            print(
                f'h5 file loaded into dictionary. Possibly returning Dataset from dictionary, otherwise Struct.'
            )
        return _possibly_create_dataset(ws)

    ws = h5io_to_struct(ws)

    if fixblocks:
        ws = ws[0]
        final_dict = {}
        for k, v in ws.items():
            if k.endswith('_items'):
                names = v.astype('U')
                rows = ws[k[:-5] + 'values']
                t_dict = dict(zip(names, rows.transpose()))
                for t_k, t_v in t_dict.items():
                    final_dict[t_k] = t_v
        ws = TypeRegister.Struct(final_dict)

    if drop_short:
        # try to make a dataset
        rownum_set = {len(ws[c]) for c in ws}
        maxrow = max(rownum_set)
        print("drop short was set! max was ", maxrow)
        final_dict = {}

        # build a new dictionary with only columns of the max length
        for k, v in ws.items():
            if len(v) == maxrow:
                final_dict[k] = v
            else:
                warnings.warn(
                    f"load_h5: drop_short, dropping col {k!r} with len {len(v)} vs {maxrow}"
                )

        ws = TypeRegister.Dataset(final_dict)

    return ws
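
A hedged usage sketch for load_h5; the file name and column names below are made-up placeholders, and the calls simply follow the signature documented above (a list or a callable for columns, fixblocks for row-block files, drop_short to force a Dataset):

# Hypothetical calls -- 'quotes.h5' and the column names are illustrative only.
ds = load_h5('quotes.h5', name='/', columns=['price', 'size'], verbose=1)

# A callable column filter is passed the column names, dtypes and shapes
# and returns the subset of names to keep.
ds = load_h5('quotes.h5',
             columns=lambda names, dtypes, shapes: [n for n in names if n != 'index'],
             fixblocks=True,    # transpose block<#>_items / block<#>_values row blocks
             drop_short=True)   # drop short columns so a Dataset can be built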
Example #8
Check if two hdf5 files are the same.
"""
import sys
import numpy as np

sys.path.append('..')
import hdf5
import msg
import check

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("usage: ./diff file1 file2")
        sys.exit(1)

    data1 = hdf5.load(sys.argv[1])
    data2 = hdf5.load(sys.argv[2])

    print("\nThe following datasets were found in %s:\n" % sys.argv[1])
    msg.list_dataset(data1)
    print("\nThe following datasets were found in %s:\n" % sys.argv[2])
    msg.list_dataset(data2)

    check.check_keys(data1, data2)

    if check.get_size(data1) != check.get_size(data2):
        msg.error("Different number of entries.")
        sys.exit(1)

    check.check_shapes(data1, data2)
Example #9
    """

    with open(filename, 'w') as f:
        for fn in filelist:
            print(os.path.abspath(fn), file=f)


if __name__ == '__main__':

    msg.box("HDF5 MANIPULATOR: SPLIT")

    args = parser()
    data = hdf5.load(args.input)

    print "The following datasets were found in %s:\n" % args.input
    msg.list_dataset(data)

    filelist = generate_filelist(
        args.prefix or os.path.splitext(args.input)[0],
        check.get_size(data), int(args.size))

    print "\nSaving output files:\n"

    for f, r in filelist.items():
        msg.list_fileinfo(f, r)
        hdf5.save_subset(f, data, r[0], r[1])

    if args.filelist:
Example #10
    if not len(data):
        msg.error("No datasets to process.")
        sys.exit(1)

    check.get_size(data)

    for key in keys:
        if key not in data:
            msg.warning("%s requested, but not found." % key)


if __name__ == '__main__':

    msg.box("HDF5 MANIPULATOR: EXTRACT")

    args = parser()
    data = hdf5.load(args.input, gkey=args.keys.split(','))

    print "The following datasets were found in %s:\n" % args.input
    msg.list_dataset(data)

    # update_data(data, [k.strip() for k in args.keys.split(',')])

    print "\nThe following dataset will be saved in %s:\n" % args.output
    msg.list_dataset(data)

    hdf5.save(args.output, data)

    msg.info("Done")