Example #1
0
    def __getitem__(self, name):
        """ Open the object called *name* and return a wrapper for it.

        *name* is either an ``h5r.Reference``, which is dereferenced against
        this object's id, or a name which is encoded with ``self._e`` and
        opened relative to this object using the shared link access
        property list.

        Returns a Group, dataset.Dataset or datatype.Datatype, chosen from
        the identifier type reported by ``h5i.get_type``.

        Raises ValueError if a reference dereferences to None, and TypeError
        if the opened identifier is none of the three known kinds.
        """
        if not isinstance(name, h5r.Reference):
            oid = h5o.open(self.id, self._e(name), lapl=self._shared.lapl)
        else:
            oid = h5r.dereference(name, self.id)
            if oid is None:
                raise ValueError("Invalid HDF5 object reference")

        # Choose the wrapper class from the identifier type.
        kind = h5i.get_type(oid)
        if kind == h5i.GROUP:
            return Group(oid)
        if kind == h5i.DATASET:
            return dataset.Dataset(oid)
        if kind == h5i.DATATYPE:
            return datatype.Datatype(oid)
        raise TypeError("Unknown object type")
Example #2
0
    def __getitem__(self, name):
        """ Look up *name* and return the matching high-level object.

        An ``h5r.Reference`` is resolved with ``h5r.dereference``; any other
        value is passed through ``self._e`` and opened with ``h5o.open``
        using ``self._lapl``.

        The result is wrapped as a Group, dataset.Dataset or
        datatype.Datatype according to ``h5i.get_type``.

        Raises ValueError when the reference cannot be dereferenced, and
        TypeError when the identifier type is not recognised.
        """
        by_reference = isinstance(name, h5r.Reference)

        if by_reference:
            oid = h5r.dereference(name, self.id)
            if oid is None:
                raise ValueError("Invalid HDF5 object reference")
        else:
            oid = h5o.open(self.id, self._e(name), lapl=self._lapl)

        # Guard-clause dispatch on the kind of identifier we obtained.
        id_type = h5i.get_type(oid)
        if id_type == h5i.GROUP:
            return Group(oid)
        if id_type == h5i.DATASET:
            return dataset.Dataset(oid)
        if id_type == h5i.DATATYPE:
            return datatype.Datatype(oid)

        raise TypeError("Unknown object type")
Example #3
0
    def __getitem__(self, args):
        """ Read a slice from the HDF5 dataset.

        Takes slices and recarray-style field names (more than one is
        allowed!) in any order.  Obeys basic NumPy rules, including
        broadcasting.

        Also supports:

        * Boolean "mask" array indexing
        * A single region reference, which must point to this dataset

        Returns a NumPy array, or a single element when the selection has
        shape ().  When exactly one field name is given, that field is
        returned directly rather than a one-field record.

        Raises ValueError if field names are given for a non-compound type,
        if a requested field does not exist, or if a region reference
        dereferences to something other than this dataset.
        """
        args = args if isinstance(args, tuple) else (args, )

        # Sort field indices from the rest of the args.
        names = tuple(x for x in args if isinstance(x, str))
        args = tuple(x for x in args if not isinstance(x, str))

        def strip_fields(basetype):
            """ Strip extra dtype information from special types """
            if basetype.kind == 'O':
                return numpy.dtype('O')
            if basetype.fields is not None:
                if basetype.kind in ('i', 'u'):
                    # Integer kind with fields: read as the type stored
                    # under the 'enum' field.
                    return basetype.fields['enum'][0]
                # Rebuild the compound type from (name, stripped subtype)
                # pairs only; the first item of each fields entry is the
                # field's dtype, the remaining items are not needed here.
                return numpy.dtype([
                    (name, strip_fields(basetype.fields[name][0]))
                    for name in basetype.names
                ])
            return basetype

        def readtime_dtype(basetype, names):
            """ Make a NumPy dtype appropriate for reading """

            basetype = strip_fields(basetype)

            if len(names) == 0:  # Not compound, or we want all fields
                return basetype

            if basetype.names is None:  # Names provided, but not compound
                raise ValueError("Field names only allowed for compound types")

            for name in names:  # Check all names are legal
                if name not in basetype.names:
                    raise ValueError("Field %s does not appear in this type." %
                                     name)

            return numpy.dtype([(name, basetype.fields[name][0])
                                for name in names])

        if self._local.astype is not None:
            new_dtype = readtime_dtype(self._local.astype, names)
        else:
            # This is necessary because in the case of array types, NumPy
            # discards the array information at the top level.
            new_dtype = readtime_dtype(self.id.dtype, names)
        mtype = h5t.py_create(new_dtype)

        # === Special-case region references ====

        if len(args) == 1 and isinstance(args[0], h5r.RegionReference):

            obj = h5r.dereference(args[0], self.id)
            if obj != self.id:
                raise ValueError("Region reference must point to this dataset")

            sid = h5r.get_region(args[0], self.id)
            mshape = sel.guess_shape(sid)
            if mshape is None:
                # NOTE(review): this yields a one-element array holding 0,
                # not an empty result; kept as-is for compatibility.
                return numpy.array((0, ), dtype=new_dtype)
            if numpy.prod(mshape) == 0:
                # Zero-size selection: return an array *of* that shape
                # (numpy.array(mshape) would return the shape values).
                return numpy.empty(mshape, dtype=new_dtype)
            out = numpy.empty(mshape, dtype=new_dtype)
            sid_out = h5s.create_simple(mshape)
            sid_out.select_all()
            self.id.read(sid_out, sid, out, mtype)
            return out

        # === Check for zero-sized datasets =====

        # numpy.prod replaces the numpy.product alias removed in NumPy 2.0.
        if numpy.prod(self.shape) == 0:
            # These are the only access methods NumPy allows for such objects
            if args == (Ellipsis, ) or args == tuple():
                return numpy.empty(self.shape, dtype=new_dtype)

        # === Scalar dataspaces =================

        if self.shape == ():
            fspace = self.id.get_space()
            selection = sel2.select_read(fspace, args)
            arr = numpy.ndarray(selection.mshape, dtype=new_dtype)
            for mspace, fspace in selection:
                self.id.read(mspace, fspace, arr, mtype)
            if len(names) == 1:
                arr = arr[names[0]]
            if selection.mshape is None:
                return arr[()]
            return arr

        # === Everything else ===================

        # Perform the dataspace selection.
        selection = sel.select(self.shape, args, dsid=self.id)

        if selection.nselect == 0:
            return numpy.ndarray(selection.mshape, dtype=new_dtype)

        # Up-converting to (1,) so that numpy.ndarray correctly creates
        # np.void rows in case of multi-field dtype. (issue 135)
        single_element = selection.mshape == ()
        mshape = (1, ) if single_element else selection.mshape
        arr = numpy.ndarray(mshape, new_dtype, order='C')

        # HDF5 has a bug where if the memory shape has a different rank
        # than the dataset, the read is very slow
        if len(mshape) < len(self.shape):
            # pad with ones
            mshape = (1, ) * (len(self.shape) - len(mshape)) + mshape

        # Perform the actual read
        mspace = h5s.create_simple(mshape)
        fspace = selection._id
        self.id.read(mspace, fspace, arr, mtype)

        # Patch up the output for NumPy
        if len(names) == 1:
            arr = arr[names[0]]  # Single-field recarray convention
        if arr.shape == ():
            # ndarray.item() is the replacement for the removed
            # numpy.asscalar().
            arr = arr.item()
        if single_element:
            arr = arr[0]
        return arr
Example #4
0
    def __getitem__(self, args):
        """ Read a slice from the HDF5 dataset.

        Takes slices and recarray-style field names (more than one is
        allowed!) in any order.  Obeys basic NumPy rules, including
        broadcasting.

        Also supports:

        * Boolean "mask" array indexing
        * A single region reference, which must point to this dataset

        Returns a NumPy array, or a single element when the selection has
        shape ().  When exactly one field name is given, that field is
        returned directly rather than a one-field record.

        Raises ValueError if field names are given for a non-compound type,
        if a requested field does not exist, or if a region reference
        dereferences to something other than this dataset.
        """
        args = args if isinstance(args, tuple) else (args,)

        # Sort field indices from the rest of the args.
        names = tuple(x for x in args if isinstance(x, str))
        args = tuple(x for x in args if not isinstance(x, str))

        def strip_fields(basetype):
            """ Strip extra dtype information from special types """
            if basetype.kind == 'O':
                return numpy.dtype('O')
            if basetype.fields is not None:
                if basetype.kind in ('i', 'u'):
                    # Integer kind with fields: read as the type stored
                    # under the 'enum' field.
                    return basetype.fields['enum'][0]
                # Only the dtype (first item of each fields entry) is
                # needed; the other items were unpacked into unused
                # locals before.
                fields = []
                for name in basetype.names:
                    subtype = strip_fields(basetype.fields[name][0])
                    fields.append((name, subtype))
                return numpy.dtype(fields)
            return basetype

        def readtime_dtype(basetype, names):
            """ Make a NumPy dtype appropriate for reading """

            basetype = strip_fields(basetype)

            if len(names) == 0:  # Not compound, or we want all fields
                return basetype

            if basetype.names is None:  # Names provided, but not compound
                raise ValueError("Field names only allowed for compound types")

            for name in names:  # Check all names are legal
                if name not in basetype.names:
                    raise ValueError("Field %s does not appear in this type." % name)

            return numpy.dtype([(name, basetype.fields[name][0]) for name in names])

        if self._local.astype is not None:
            new_dtype = readtime_dtype(self._local.astype, names)
        else:
            # This is necessary because in the case of array types, NumPy
            # discards the array information at the top level.
            new_dtype = readtime_dtype(self.id.dtype, names)
        mtype = h5t.py_create(new_dtype)

        # === Special-case region references ====

        if len(args) == 1 and isinstance(args[0], h5r.RegionReference):

            obj = h5r.dereference(args[0], self.id)
            if obj != self.id:
                raise ValueError("Region reference must point to this dataset")

            sid = h5r.get_region(args[0], self.id)
            mshape = sel.guess_shape(sid)
            if mshape is None:
                # Uses the `numpy` name like the rest of this method (the
                # previous `np` spelling appeared only in this branch).
                # NOTE(review): this yields a one-element array holding 0,
                # not an empty result; kept as-is for compatibility.
                return numpy.array((0,), dtype=new_dtype)
            if numpy.prod(mshape) == 0:
                # Zero-size selection: return an array *of* that shape
                # (numpy.array(mshape) would return the shape values).
                return numpy.empty(mshape, dtype=new_dtype)
            out = numpy.empty(mshape, dtype=new_dtype)
            sid_out = h5s.create_simple(mshape)
            sid_out.select_all()
            self.id.read(sid_out, sid, out, mtype)
            return out

        # === Check for zero-sized datasets =====

        # numpy.prod replaces the numpy.product alias removed in NumPy 2.0.
        if numpy.prod(self.shape) == 0:
            # These are the only access methods NumPy allows for such objects
            if args == (Ellipsis,) or args == tuple():
                return numpy.empty(self.shape, dtype=new_dtype)

        # === Scalar dataspaces =================

        if self.shape == ():
            fspace = self.id.get_space()
            selection = sel2.select_read(fspace, args)
            arr = numpy.ndarray(selection.mshape, dtype=new_dtype)
            for mspace, fspace in selection:
                self.id.read(mspace, fspace, arr, mtype)
            if len(names) == 1:
                arr = arr[names[0]]
            if selection.mshape is None:
                return arr[()]
            return arr

        # === Everything else ===================

        # Perform the dataspace selection.
        selection = sel.select(self.shape, args, dsid=self.id)

        if selection.nselect == 0:
            return numpy.ndarray(selection.mshape, dtype=new_dtype)

        # Up-converting to (1,) so that numpy.ndarray correctly creates
        # np.void rows in case of multi-field dtype. (issue 135)
        single_element = selection.mshape == ()
        mshape = (1,) if single_element else selection.mshape
        arr = numpy.ndarray(mshape, new_dtype, order='C')

        # HDF5 has a bug where if the memory shape has a different rank
        # than the dataset, the read is very slow
        if len(mshape) < len(self.shape):
            # pad with ones
            mshape = (1,) * (len(self.shape) - len(mshape)) + mshape

        # Perform the actual read
        mspace = h5s.create_simple(mshape)
        fspace = selection._id
        self.id.read(mspace, fspace, arr, mtype)

        # Patch up the output for NumPy
        if len(names) == 1:
            arr = arr[names[0]]     # Single-field recarray convention
        if arr.shape == ():
            # ndarray.item() is the replacement for the removed
            # numpy.asscalar().
            arr = arr.item()
        if single_element:
            arr = arr[0]
        return arr
Example #5
0
    def __getitem__(self, args, new_dtype=None):
        """ Read a selection from this dataset into a new NumPy array.

        ``args`` may mix index expressions with recarray-style field names
        (several names are allowed), in any order.  When field names are
        present the read is delegated to ``self.fields(...)`` with the
        remaining index arguments.

        ``new_dtype`` overrides the dtype used for reading; when omitted it
        falls back to ``self._local.astype`` (if set) and then to
        ``self.dtype``.

        A single region reference is also accepted; it must dereference to
        this dataset, otherwise ValueError is raised.
        """
        # The preamble below follows h5py.Dataset.__getitem__
        if not isinstance(args, tuple):
            args = (args, )

        if new_dtype is None:
            new_dtype = getattr(self._local, 'astype', None)

        # Separate field names from the index arguments.
        field_names = tuple(a for a in args if isinstance(a, str))
        if field_names:
            # Reading a subset of a structured dtype; a lone name is passed
            # on its own so the simpler dtype of that field is used.
            selector = field_names[0] if len(field_names) == 1 else field_names
            index_args = tuple(a for a in args if not isinstance(a, str))
            return self.fields(selector, _prior_dtype=new_dtype)[index_args]

        if new_dtype is None:
            new_dtype = self.dtype
        mtype = h5t.py_create(new_dtype)

        # === Special-case region references ====

        if len(args) == 1 and isinstance(args[0], h5r.RegionReference):
            ref = args[0]
            if h5r.dereference(ref, self.id) != self.id:
                raise ValueError("Region reference must point to this dataset")

            region_space = h5r.get_region(ref, self.id)
            region_shape = guess_shape(region_space)
            if region_shape is None:
                # 0D with no data (NULL or deselected SCALAR)
                return Empty(new_dtype)

            result = np.empty(region_shape, dtype=new_dtype)
            if result.size != 0:
                # Only touch the file when there is something to read.
                mem_space = h5s.create_simple(region_shape)
                mem_space.select_all()
                self.id.read(mem_space, region_space, result, mtype)
            return result

        # === End of the part following h5py.Dataset.__getitem__ ===

        idx = ndindex(args).reduce(self.shape)
        out = np.ndarray(idx.newshape(self.shape), new_dtype, order='C')

        for chunk, sub_index in as_subchunks(idx, self.shape, self.chunks):
            stored = self.id.data_dict[chunk]

            # An entry that is still an index (slice/Slice/tuple/Tuple)
            # rather than array data is read via _read_chunk and the
            # result is cached back into data_dict.
            if isinstance(stored, (slice, Slice, tuple, Tuple)):
                trailing = [slice(0, len(dim)) for dim in chunk.args[1:]]
                stored = self.id._read_chunk(Tuple(stored, *trailing).raw)
                self.id.data_dict[chunk] = stored

            if stored.size != 0:
                target = chunk.as_subindex(idx)
                out[target.raw] = stored[sub_index.raw]

        return out