Example #1
    def test_vlen_ascii(self):
        dt = h5py.string_dtype(encoding='ascii')

        string_info = h5py.check_string_dtype(dt)
        assert string_info.encoding == 'ascii'
        assert string_info.length is None
        assert h5py.check_vlen_dtype(dt) is bytes
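(Aside, not from the original test suite: a minimal round-trip sketch of what the variable-length ASCII dtype above means in practice; the file name is hypothetical.)

    import h5py

    dt = h5py.string_dtype(encoding='ascii')       # variable-length ASCII
    with h5py.File('vlen_ascii.h5', 'w') as f:     # hypothetical file name
        ds = f.create_dataset('words', (2,), dtype=dt)
        ds[:] = [b'foo', b'barbaz']                # elements may differ in length
    with h5py.File('vlen_ascii.h5', 'r') as f:
        # check_vlen_dtype(dt) is bytes, so raw reads yield bytes objects
        assert f['words'][0] == b'foo'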
Example #2
    def test_vlen_utf8(self):
        dt = h5py.string_dtype()

        string_info = h5py.check_string_dtype(dt)
        assert string_info.encoding == 'utf-8'
        assert string_info.length is None
        assert h5py.check_vlen_dtype(dt) is str
Example #3
    def test_vlen_utf8(self):
        dt = h5py.string_dtype()

        string_info = h5py.check_string_dtype(dt)
        assert string_info.encoding == 'utf-8'
        assert string_info.length is None
        assert h5py.check_vlen_dtype(dt) is six.text_type  # six.text_type is str on Python 3; snippet predates h5py dropping Python 2
Example #4
    def test_fixed_utf8(self):
        dt = h5py.string_dtype(length=10)

        string_info = h5py.check_string_dtype(dt)
        assert string_info.encoding == 'utf-8'
        assert string_info.length == 10
        assert h5py.check_vlen_dtype(dt) is None
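(A brief standalone illustration, assumed rather than taken from the test file, of why check_vlen_dtype returns None here: a fixed-length string dtype is a sized NumPy bytes dtype, not a variable-length one.)

    import h5py

    dt = h5py.string_dtype(length=10)          # fixed-length, 10 bytes per element
    assert dt.itemsize == 10                   # stored as a plain sized dtype ('S10')
    assert h5py.check_vlen_dtype(dt) is None   # no vlen base type to report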
Example #5
    def test_fixed_ascii(self):
        dt = h5py.string_dtype(encoding='ascii', length=10)

        string_info = h5py.check_string_dtype(dt)
        assert string_info.encoding == 'ascii'
        assert string_info.length == 10
        assert h5py.check_vlen_dtype(dt) is None
Example #6
    def test_vlen_enum(self):
        fname = self.mktemp()
        arr1 = [[1], [1, 2]]
        dt1 = h5py.vlen_dtype(h5py.enum_dtype(dict(foo=1, bar=2), 'i'))

        with h5py.File(fname, 'w') as f:
            df1 = f.create_dataset('test', (len(arr1), ), dtype=dt1)
            df1[:] = np.array(arr1)

        with h5py.File(fname, 'r') as f:
            df2 = f['test']
            dt2 = df2.dtype
            arr2 = [e.tolist() for e in df2[:]]

        self.assertEqual(arr1, arr2)
        self.assertEqual(h5py.check_enum_dtype(h5py.check_vlen_dtype(dt1)),
                         h5py.check_enum_dtype(h5py.check_vlen_dtype(dt2)))
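(A hedged aside on the final assertion: check_vlen_dtype strips the vlen layer and returns the underlying enum dtype, which check_enum_dtype then resolves to its name-to-value mapping.)

    import h5py

    dt = h5py.vlen_dtype(h5py.enum_dtype(dict(foo=1, bar=2), 'i'))
    base = h5py.check_vlen_dtype(dt)   # the enum dtype wrapped inside the vlen
    assert h5py.check_enum_dtype(base) == {'foo': 1, 'bar': 2}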
Example #7
    def _test_dset_val(zobj, hobj, hobj_info):
        if (hobj_info.type == h5py.h5o.TYPE_DATASET
                and self._checkdtype_structobjref(hobj) == (False, False)
                and h5py.check_vlen_dtype(hobj.dtype)):
            hobj = hobj.asstr()  # wrapper to read data as python str
            hval = hobj[()]
            zval = zobj[()]
            assert_array_equal(hval, zval)
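(For reference, a minimal sketch of the asstr() pattern used above; the file and dataset names are hypothetical. asstr() wraps a vlen string dataset so reads decode to Python str instead of bytes.)

    import h5py

    with h5py.File('example.h5', 'r') as f:    # hypothetical file
        dset = f['text']                       # hypothetical vlen string dataset
        if h5py.check_vlen_dtype(dset.dtype):
            values = dset.asstr()[()]          # decoded to str on read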
Example #8
    def _test_dset_val(zobj, hobj, hobj_info):
        if (hobj_info.type == h5py.h5o.TYPE_DATASET
                and self._checkdtype_structobjref(hobj) == (True, False)
                and not h5py.check_vlen_dtype(hobj.dtype)):
            hval = hobj[()]
            zval = zobj[()]
            for dt_name in hobj.dtype.names:
                assert_array_equal(hval[dt_name], zval[dt_name])
Example #9
    def _test_dset_val(zobj, hobj, hobj_info):
        if (hobj_info.type == h5py.h5o.TYPE_DATASET
                and self._checkdtype_structobjref(hobj) == (False, True)
                and not h5py.check_vlen_dtype(hobj.dtype)):
            hval = hobj[()]
            zval = zobj[()]
            ref_array_func = np.frompyfunc(
                lambda x: h5py.h5i.get_name(
                    h5py.h5r.dereference(x, self.hfile.id)), 1, 1)
            if hobj.shape != ():
                hval_str = ref_array_func(hval).astype(str)
            else:
                hval_str = h5py.h5i.get_name(
                    h5py.h5r.dereference(hval, self.hfile.id))
                hval_str = hval_str.decode('utf-8')
            if self.hfile.name == '/':
                assert_array_equal(hval_str, zval)
            else:
                assert_array_equal(
                    np.frompyfunc(
                        lambda x: x if x.startswith(self.hfile.name) else '',
                        1, 1)(hval_str), zval)
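(A compact sketch, with assumed names, of the low-level pattern used above: h5py.h5r.dereference turns a stored object reference back into an object id, and h5py.h5i.get_name returns that object's path in the file as bytes.)

    import h5py

    with h5py.File('example.h5', 'r') as f:    # hypothetical file
        ref = f['refs'][0]                     # hypothetical dataset of object references
        target = h5py.h5r.dereference(ref, f.id)
        path = h5py.h5i.get_name(target).decode('utf-8')   # e.g. '/some/dataset'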
Example #10
    def test_open_as_zarr_dset_values(self, request, capsys, filesbase):
        # get the list of files passed via the hdf5files arg;
        # if hdf5files is not specified, file_list will contain generated hdf5 files
        num_files = filesbase.num_files
        file_list = filesbase.file_list[0:num_files]

        # find list of datasets in files
        if len(self.objnames) != 0:
            dset_list = [
                f[name] for name in self.objnames for f in file_list
                if (name in f and isinstance(f[name], h5py.Dataset))
            ]
            if len(dset_list) == 0:
                message = f"No given file contains {self.objnames}"
                with capsys.disabled():
                    print("\n" + message.rjust(len(request.node.nodeid)),
                          end='')
                pytest.skip(message)
        # if objnames arg is not passed, select datasets for each file
        else:
            numdset = request.config.getoption('numdataset')
            if numdset <= 0:
                pytest.skip(f"numdataset: {numdset}")

            dset_names = []

            def _get_dsets(name, info):
                nonlocal dset_names
                if info.type == h5py.h5o.TYPE_DATASET:
                    dset_names.append(name.decode('utf-8'))

            dset_list = []
            for hfile in file_list:
                dset_names = []
                h5py.h5o.visit(hfile.id, _get_dsets, info=True)
                names = dset_names[0:numdset]
                for name in names:
                    dset_list.append(hfile[name])
                    message = f"objnames not specified. open_as_zarr run with file: {hfile.filename}, dataset: {name}"
                    with capsys.disabled():
                        print("\n" + message.rjust(len(request.node.nodeid)),
                              end='')

        for dset in dset_list:
            with capsys.disabled():
                print("\n" +
                      f"file: {dset.file.filename}, dataset: {dset}  :".rjust(
                          len(request.node.nodeid)),
                      end='')
                print("\n" + f"dataset: {dset.name}, data  :".rjust(
                    len(request.node.nodeid)),
                      end='')

            # call open_as_zarr
            if not dset.dtype.hasobject:
                zarray = open_as_zarr(
                    dset)  # dataset does not have object references
            else:
                zarray = open_as_zarr(
                    dset, collectrefs=True)  # dataset with object references

            # test values when dtype is variable length
            if h5py.check_vlen_dtype(dset.dtype) is not None:
                dset_str = dset.asstr()  # wrapper to read data as python str
                assert_array_equal(dset_str, zarray)
            # test values when dtype is structured array with object reference
            elif dset.dtype.hasobject and dset.dtype.names is not None:
                hval = dset[()]
                # function to get reference target names
                ref_array_func = np.frompyfunc(
                    lambda x: h5py.h5i.get_name(
                        h5py.h5r.dereference(x, dset.file.id)), 1, 1)
                for dtname in dset.dtype.names:
                    if dset.dtype[dtname].hasobject:
                        if dset.shape != ():
                            hval_str = ref_array_func(hval[dtname]).astype(str)
                        else:
                            hval_str = h5py.h5i.get_name(
                                h5py.h5r.dereference(hval[dtname],
                                                     dset.file.id))
                            hval_str = hval_str.decode('utf-8')
                        assert_array_equal(hval_str, zarray[dtname])
            # test values when dtype is object reference
            elif dset.dtype.hasobject and dset.dtype.names is None:
                hval = dset[()]
                # function to get reference target names
                ref_array_func = np.frompyfunc(
                    lambda x: h5py.h5i.get_name(
                        h5py.h5r.dereference(x, dset.file.id)), 1, 1)
                if dset.shape != ():
                    hval_str = ref_array_func(hval).astype(str)
                else:
                    hval_str = h5py.h5i.get_name(
                        h5py.h5r.dereference(hval, dset.file.id))
                    hval_str = hval_str.decode('utf-8')
                assert_array_equal(hval_str, zarray)
            # test values when dtype is simple
            else:
                assert_array_equal(dset, zarray)

            with capsys.disabled():
                print("\n" + f"dataset: {dset.name}, attrs  :".rjust(
                    len(request.node.nodeid)),
                      end='')

            # test attrs
            for key, val in dset.attrs.items():
                assert_array_equal(val, zarray.attrs[key])

            with capsys.disabled():
                print("\n" + f"dataset: {dset.name}, fillvalue  :".rjust(
                    len(request.node.nodeid)),
                      end='')

            # test fillvalue
            # if dtype is structured array
            if dset.fillvalue is not None and dset.fillvalue.dtype.names is not None:
                if dset.fillvalue.dtype.hasobject:
                    message = f"structured array fillvalue {dset.fillvalue} with object dtype not supported."
                    with capsys.disabled():
                        print(("\n" + message).rjust(len(request.node.nodeid)),
                              end='')
                    pytest.xfail(reason=message)
                assert_array_equal(dset.fillvalue, zarray.fill_value)
            # if fillvalue is an object reference:
            elif dset.fillvalue is not None and dset.fillvalue.dtype.hasobject:
                hval_str = h5py.h5i.get_name(
                    h5py.h5r.dereference(dset.fillvalue, dset.file.id))
                hval_str = hval_str.decode('utf-8')
                assert_array_equal(hval_str, zarray.fill_value)
            # simple fillvalue
            else:
                assert_array_equal(dset.fillvalue, zarray.fill_value)
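(In outline, and hedging on the project-specific open_as_zarr helper, the per-dataset value check above reduces to three comparison cases; this is a condensed restatement, not additional verified API.)

    # condensed restatement of the branching above
    # (collectrefs=False is assumed to be open_as_zarr's default)
    zarray = open_as_zarr(dset, collectrefs=dset.dtype.hasobject)
    if h5py.check_vlen_dtype(dset.dtype) is not None:
        assert_array_equal(dset.asstr(), zarray)   # vlen strings: compare decoded text
    elif dset.dtype.hasobject:
        pass                                       # object references: compare dereferenced target names
    else:
        assert_array_equal(dset, zarray)           # plain dtypes: compare directly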
Example #11
    def _rewrite_vlen_to_fixed(h5py_group, changed_dsets=None):
        """  Scan hdf5 file or hdf5 group object and recursively convert variable-length string datasets to fixed-length
        Args:
          h5py_group: h5py.Group or h5py.File object
          changed_dsets: mapping of renamed datasets, accumulated across recursive calls
        """
        # avoid the shared mutable default pitfall: fresh dict per top-level call
        if changed_dsets is None:
            changed_dsets = {}

        if (not isinstance(h5py_group, h5py.File) and (not issubclass(
                h5py_group.file.get(h5py_group.name, getclass=True),
                h5py.Group) or not issubclass(
                    h5py_group.file.get(h5py_group.name,
                                        getclass=True,
                                        getlink=True), h5py.HardLink))):
            raise TypeError(
                f"{h5py_group} should be an h5py.File, or an h5py.Group linked as an h5py.HardLink"
            )

        # iterate through group members
        group_iter = [name for name in h5py_group.keys()]
        for name in group_iter:
            obj = h5py_group[name]

            # get group member's link class
            obj_linkclass = h5py_group.get(name, getclass=True, getlink=True)

            # Datasets
            if issubclass(h5py_group.get(name, getclass=True), h5py.Dataset):
                if issubclass(obj_linkclass, h5py.ExternalLink):
                    print(
                        f"Skipped rewriting variable-length dataset {obj.name}: External Link"
                    )
                    continue
                dset = obj

                # variable-length Datasets
                if h5py.check_vlen_dtype(
                        dset.dtype) and h5py.check_string_dtype(dset.dtype):

                    vlen_stringarr = dset[()]
                    if dset.shape == ():
                        string_lengths_ = len(vlen_stringarr)
                        length_max = string_lengths_
                    else:
                        length_max = max(
                            len(el) for el in vlen_stringarr.flatten())
                    if dset.fillvalue is not None:
                        length_max = max(length_max, len(dset.fillvalue))
                    length_max = length_max + (-length_max) % 8  # round up to a multiple of 8
                    dt_fixedlen = f'|S{length_max}'

                    if isinstance(dset.fillvalue, str):
                        dset_fillvalue = dset.fillvalue.encode('utf-8')
                    else:
                        dset_fillvalue = dset.fillvalue

                    affix_ = '_fixedlen~'
                    dset_name = dset.name
                    h5py_group.file.move(dset_name, dset_name + affix_)
                    changed_dsets[dset_name + affix_] = dset_name
                    dsetf = h5py_group.file.create_dataset_like(
                        dset_name,
                        dset,
                        dtype=dt_fixedlen,
                        fillvalue=dset_fillvalue)

                    # TO DO: copy attrs after all string datasets are moved
                    for key, val in dset.attrs.items():
                        if isinstance(
                                val,
                            (bytes, np.bool_, str, int, float, np.number)):
                            dsetf.attrs[key] = val
                        else:
                            # TO DO #
                            print(
                                f"Moving variable-length string Datasets: attribute value of type "
                                f"{type(val)} is not processed. Attribute {key} of object {dsetf.name}"
                            )

                    if dsetf.shape == ():
                        if isinstance(vlen_stringarr, bytes):
                            dsetf[...] = vlen_stringarr
                        else:
                            dsetf[...] = vlen_stringarr.encode('utf-8')
                    else:
                        dsetf[...] = vlen_stringarr.astype(dt_fixedlen)

            # Groups
            elif (issubclass(h5py_group.get(name, getclass=True), h5py.Group)
                  and not issubclass(obj_linkclass, h5py.SoftLink)):
                if issubclass(obj_linkclass, h5py.ExternalLink):
                    print(f"Group {obj.name} is not processed: External Link")
                    continue
                changed_dsets = HDF5Zarr._rewrite_vlen_to_fixed(
                    obj, changed_dsets)

        return changed_dsets
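(A hedged usage sketch, with a hypothetical file name, assuming the surrounding HDF5Zarr class from this project: the helper is called on a file opened in r+ mode and returns a mapping from each temporary moved name back to the original dataset name.)

    import h5py

    with h5py.File('example.h5', 'r+') as f:          # hypothetical file
        changed = HDF5Zarr._rewrite_vlen_to_fixed(f)  # rewrite vlen string datasets in place
        # changed maps '<name>_fixedlen~' -> '<name>' for each rewritten dataset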
Example #12
    def create_zarr_hierarchy(self, h5py_group, zgroup):
        """  Scan hdf5 file and recursively create zarr attributes, groups and dataset structures for accessing data
        Args:
          h5py_group: h5py.Group or h5py.File object where information is gathered from
          zgroup:     Zarr Group
        """

        if (not isinstance(h5py_group, h5py.File) and
            (not issubclass(self.file.get(
                h5py_group.name, getclass=True), h5py.Group) or not issubclass(
                    self.file.get(h5py_group.name, getclass=True,
                                  getlink=True), h5py.HardLink))):
            raise TypeError(
                f"{h5py_group} should be an h5py.File, or an h5py.Group linked as an h5py.HardLink"
            )

        self.copy_attrs_data_to_zarr_store(h5py_group, zgroup)

        # add hdf5 group address in file to self._address_dict
        self._address_dict[h5py.h5o.get_info(
            h5py_group.id).addr] = h5py_group.name

        # iterate through group members
        test_iter = [name for name in h5py_group.keys()]
        for name in test_iter:
            obj = h5py_group[name]

            # get group member's link class
            obj_linkclass = h5py_group.get(name, getclass=True, getlink=True)

            # Datasets
            # TO DO, Soft Links #
            if issubclass(h5py_group.get(name, getclass=True), h5py.Dataset):
                if issubclass(obj_linkclass, h5py.ExternalLink):
                    print(
                        f"Dataset {obj.name} is not processed: External Link")
                    continue
                dset = obj

                # number of filters
                dcpl = dset.id.get_create_plist()
                nfilters = dcpl.get_nfilters()
                if nfilters > 1:
                    # TO DO #
                    print(
                        f"Dataset {dset.name} with multiple filters is not processed"
                    )
                    continue
                elif nfilters == 1:
                    # get first filter information
                    filter_tuple = dset.id.get_create_plist().get_filter(0)
                    filter_code = filter_tuple[0]
                    if filter_code in self._hdf5_regfilters_subset and self._hdf5_regfilters_subset[
                            filter_code] is not None:
                        # TO DO
                        if filter_code == 32001:
                            # Blosc
                            blosc_names = {
                                0: 'blosclz',
                                1: 'lz4',
                                2: 'lz4hc',
                                3: 'snappy',
                                4: 'zlib',
                                5: 'zstd'
                            }
                            clevel, shuffle, cname_id = filter_tuple[2][-3:]
                            cname = blosc_names[cname_id]
                            compression = self._hdf5_regfilters_subset[
                                filter_code](cname=cname,
                                             clevel=clevel,
                                             shuffle=shuffle)
                        else:
                            compression = self._hdf5_regfilters_subset[
                                filter_code](level=filter_tuple[2])
                    else:
                        print(
                            f"Dataset {dset.name} with compression filter {filter_tuple[3]}, "
                            f"hdf5 filter number {filter_tuple[0]} is not processed: "
                            f"no compatible zarr codec")
                        continue
                else:
                    compression = None

                object_codec = None
                # default chunk shape; refined below for uncompressed contiguous datasets
                dset_chunks = dset.chunks

                if dset.dtype.names is not None:
                    # Structured array with Reference dtype

                    dset_type = dset.id.get_type()
                    dt_nmembers = dset_type.get_nmembers()

                    dtype_ = []
                    dset_fillvalue = list(dset.fillvalue)
                    for dt_i in range(dt_nmembers):
                        dtname = dset.dtype.names[dt_i]
                        if dset_type.get_member_class(
                                dt_i) == h5py.h5t.REFERENCE:
                            fcid = dset.file.id.get_create_plist()
                            unit_address_size, _ = fcid.get_sizes()
                            dtype_ += [(dtname,
                                        np.dtype(f'uint{unit_address_size*8}'))
                                       ]
                            if dset.fillvalue[dt_i]:
                                dset_fillvalue[dt_i] = h5py.h5o.get_info(
                                    h5py.h5r.dereference(
                                        dset.fillvalue[dt_i],
                                        self.file.id)).addr
                            else:
                                dset_fillvalue[dt_i] = 0
                        else:
                            dtype_ += [(dtname, dset.dtype.base[dt_i])]
                    zarray = zgroup.create_dataset(
                        dset.name,
                        shape=dset.shape,
                        dtype=dtype_,
                        chunks=dset.chunks or False,
                        fill_value=tuple(dset_fillvalue),
                        compression=compression,
                        overwrite=True)

                # variable-length Datasets
                elif h5py.check_vlen_dtype(dset.dtype):
                    if not h5py.check_string_dtype(dset.dtype):
                        print(
                            f"Dataset {dset.name} is not processed: Variable-length dataset, not string"
                        )
                        continue
                    else:
                        object_codec = VLenHDF5String()
                        zarray = zgroup.create_dataset(
                            dset.name,
                            shape=dset.shape,
                            dtype=object,
                            chunks=dset.chunks or False,
                            fill_value=dset.fillvalue,
                            compression=compression,
                            overwrite=True,
                            object_codec=object_codec)
                        dset_chunks = dset.chunks

                elif dset.dtype.hasobject:
                    # TO DO test #
                    dset_type = dset.id.get_type()

                    if dset_type.get_class() == h5py.h5t.REFERENCE:
                        fcid = dset.file.id.get_create_plist()
                        unit_address_size, _ = fcid.get_sizes()
                        dtype_ = np.dtype(f'uint{unit_address_size*8}')
                        if dset.fillvalue:
                            dset_fillvalue = h5py.h5o.get_info(
                                h5py.h5r.dereference(dset.fillvalue,
                                                     self.file.id)).addr
                        else:
                            dset_fillvalue = 0

                        zarray = zgroup.create_dataset(
                            dset.name,
                            shape=dset.shape,
                            dtype=dtype_,
                            chunks=dset.chunks or False,
                            fill_value=dset_fillvalue,
                            compression=compression,
                            overwrite=True)

                    elif dset_type.get_class() == h5py.h5t.STD_REF_DSETREG:
                        print(
                            f"Dataset {dset.name} is not processed: Region Reference dtype"
                        )
                        continue
                    else:
                        print(
                            f"Dataset {dset.name} is not processed: Object dtype"
                        )
                        continue

                else:
                    if compression is None and (dset.chunks is None
                                                or dset.chunks == dset.shape):
                        # contiguous (or single-chunk) uncompressed dataset:
                        # derive a chunk shape capped at self.max_chunksize bytes
                        dset_chunks = dset.chunks if dset.chunks else dset.shape
                        if dset.shape != ():
                            dset_chunks = list(dset_chunks)
                            dim_ = 0
                            ratio_ = self.max_chunksize / (
                                np.prod(dset_chunks) * dset.dtype.itemsize)
                            while ratio_ < 1:
                                chunk_dim_ = int(ratio_ * dset_chunks[dim_])
                                chunk_dim_ = chunk_dim_ if chunk_dim_ else 1
                                chunk_dim_ -= np.argmax(
                                    dset_chunks[dim_] %
                                    np.arange(chunk_dim_, chunk_dim_ // 2, -1))
                                dset_chunks[dim_] = int(chunk_dim_)
                                ratio_ = self.max_chunksize / (
                                    np.prod(dset_chunks) * dset.dtype.itemsize)
                                dim_ += 1

                            dset_chunks = tuple(dset_chunks)
                        dset_chunks = dset_chunks or None
                    else:
                        dset_chunks = dset.chunks

                    zarray = zgroup.create_dataset(dset.name,
                                                   shape=dset.shape,
                                                   dtype=dset.dtype,
                                                   chunks=dset_chunks or False,
                                                   fill_value=dset.fillvalue,
                                                   compression=compression,
                                                   overwrite=True)

                self.copy_attrs_data_to_zarr_store(dset, zarray)
                info = self.storage_info(dset, dset_chunks)

                if object_codec is not None:
                    info = self.vlen_storage_info(dset, info)

                # Store metadata
                if info:
                    info['source'] = {'uri': self.uri, 'array_name': dset.name}
                    FileChunkStore.chunks_info(zarray, info)

            # Groups
            elif (issubclass(h5py_group.get(name, getclass=True), h5py.Group)
                  and not issubclass(obj_linkclass, h5py.SoftLink)):
                if issubclass(obj_linkclass, h5py.ExternalLink):
                    print(f"Group {obj.name} is not processed: External Link")
                    continue
                group_ = obj
                zgroup_ = self.zgroup.create_group(group_.name, overwrite=True)
                self.create_zarr_hierarchy(group_, zgroup_)

            # Groups, Soft Link
            elif (issubclass(h5py_group.get(name, getclass=True), h5py.Group)
                  and issubclass(obj_linkclass, h5py.SoftLink)):
                group_ = obj
                zgroup_ = self.zgroup.create_group(group_.name, overwrite=True)
                self.copy_attrs_data_to_zarr_store(group_, zgroup_)

                zgroup_path = zgroup_.create_group(SYMLINK, overwrite=True)
                zgroup_path.attrs[group_.name] = h5py_group.get(
                    name, getlink=True).path
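(A rough driver sketch; the HDF5Zarr construction details are assumptions, not verified API. The method mirrors the HDF5 hierarchy into a zarr group, so a caller wires up a zarr store and passes the file's root group.)

    import zarr

    store = zarr.MemoryStore()                      # any zarr store would do
    zroot = zarr.group(store=store, overwrite=True)
    # translator: hypothetical HDF5Zarr instance holding the open h5py.File
    translator.create_zarr_hierarchy(translator.file, zroot)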
Example #13
def index_to_pandas(dset: h5py.Dataset,
                    fields: None | Sequence[str] = None) -> pd.MultiIndex:
    """Construct an MultiIndex from the passed ``index`` dataset.

    Examples
    --------
    .. testsetup:: python

        >>> from dataCAT.testing_utils import HDF5_READ as filename

    .. code:: python

        >>> from dataCAT import index_to_pandas
        >>> import h5py

        >>> filename = str(...)  # doctest: +SKIP

        # Convert the entire dataset
        >>> with h5py.File(filename, "r") as f:
        ...     dset: h5py.Dataset = f["ligand"]["index"]
        ...     index_to_pandas(dset)
        MultiIndex([('O=C=O', 'O1'),
                    ('O=C=O', 'O3'),
                    ( 'CCCO', 'O4')],
                   names=['ligand', 'ligand anchor'])

        # Convert a subset of fields
        >>> with h5py.File(filename, "r") as f:
        ...     dset = f["ligand"]["index"]
        ...     index_to_pandas(dset, fields=["ligand"])
        MultiIndex([('O=C=O',),
                    ('O=C=O',),
                    ( 'CCCO',)],
                   names=['ligand'])

    Parameters
    ----------
    dset : :class:`h5py.Dataset`
        The relevant ``index`` dataset.
    fields : :class:`Sequence[str]<collections.abc.Sequence>`
        The names of the ``index`` fields that are to-be included in the
        returned MultiIndex. If :data:`None`, include all fields.

    Returns
    -------
    :class:`pandas.MultiIndex`
        A multi-index constructed from the passed dataset.

    """
    # Fast-path for non-void-based datasets
    if dset.dtype.fields is None:
        if h5py.check_string_dtype(dset.dtype):
            ar = dset[:].astype(str)
        elif h5py.check_vlen_dtype(dset.dtype):
            ar = _vlen_to_tuples(dset[:])
        else:
            ar = dset[:]
        return pd.MultiIndex.from_arrays([ar])

    # Parse the `fields` parameter
    if fields is None:
        field_names = list(dset.dtype.fields.keys())
        iterator = ((name, f_dtype)
                    for name, (f_dtype, *_) in dset.dtype.fields.items())
    else:
        field_names = list(fields)
        iterator = ((name, dset.dtype.fields[name][0]) for name in fields)
    if len(field_names) == 0:
        raise ValueError("At least one field is required")

    fields_lst = []
    index_ar = dset[:]
    for name, field_dtype in iterator:
        # It's a bytes-string; decode it
        if h5py.check_string_dtype(field_dtype):
            ar = index_ar[name].astype(str)

        # It's a h5py `vlen` dtype; convert it into a list of tuples
        elif h5py.check_vlen_dtype(field_dtype):
            ar = _vlen_to_tuples(index_ar[name])

        else:
            ar = index_ar[name]
        fields_lst.append(ar)
    return pd.MultiIndex.from_arrays(fields_lst, names=field_names)
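(The _vlen_to_tuples helper is referenced but not shown in this snippet; a minimal sketch of what it plausibly does, turning an object array of variable-length arrays into hashable tuples that pandas can index, might be:)

    import numpy as np

    def _vlen_to_tuples(ar: np.ndarray) -> list:
        """Hypothetical stand-in for dataCAT's private helper of the same name."""
        return [tuple(el) for el in ar]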
Example #14
    def _test_dsets_read(zobj, hobj, hobj_info):
        if (hobj_info.type == h5py.h5o.TYPE_DATASET
                and self._checkdtype_structobjref(hobj) == (False, False)
                and h5py.check_vlen_dtype(hobj.dtype)):
            zval = zobj[()]
            hval = hobj.asstr()[()]  # read vlen strings as python str
            assert_array_equal(hval, zval)