def test_enum(self):
    """ Enum type translation

    Literal:
    - TypeIntegerID
    Logical:
    - TypeEnumID
    - Base TypeIntegerID
    - 0 to (at least) 128 values
    """
    enums = [{}, {'a': 0, 'b': 1}, dict(("%s" % d, d) for d in xrange(128))]
    bases = ('|i1', '|u1', '<i4', '>i4', '<u8')

    for b in bases:
        for e in enums:
            dt = h5t.py_new_enum(b, e)
            htype_comp = h5t.py_create(b)

            htype = h5t.py_create(dt)
            self.assert_(isinstance(htype, h5t.TypeIntegerID))
            self.assertEqual(htype, htype_comp)

            htype = h5t.py_create(dt, logical=True)
            self.assert_(isinstance(htype, h5t.TypeEnumID), "%s" % (htype,))

            basetype = htype.get_super()
            self.assertEqual(htype_comp, basetype)

            self.assertEqual(htype.get_nmembers(), len(e))
            for idx in xrange(htype.get_nmembers()):
                name = htype.get_member_name(idx)
                value = htype.get_member_value(idx)
                self.assertEqual(e[name], value)
def test_ldouble_mapping(self):
    """ Test mapping for extended-precision """
    self.assertEqual(h5t.NATIVE_LDOUBLE.dtype, np.longdouble(1).dtype)
    if hasattr(np, 'float96'):
        self.assertEqual(h5t.py_create(np.dtype('float96')).dtype,
                         np.longdouble(1).dtype)
    if hasattr(np, 'float128'):
        self.assertEqual(h5t.py_create(np.dtype('float128')).dtype,
                         np.longdouble(1).dtype)
def _add_typecode(tc, sizes_dict):
    dt_le = np.dtype('<' + tc)
    dt_be = np.dtype('>' + tc)
    entries = sizes_dict.setdefault(dt_le.itemsize, [])
    entries.append((h5t.py_create(dt_le), dt_le.name))
    entries.append((h5t.py_create(dt_be), dt_be.name + ' (big-endian)'))
def test_standard_int():
    it = h5t.py_create(np.dtype('<i4'))
    assert datatypes.fmt_dtype(it) == 'int32'
    assert datatypes.dtype_description(it) == '32-bit signed integer'

    ut = h5t.py_create(np.dtype('>u8'))
    assert datatypes.fmt_dtype(ut) == 'uint64 (big-endian)'
    assert datatypes.dtype_description(ut) == '64-bit unsigned integer'
def test_string():
    # vlen string
    vst = h5t.py_create(h5t.string_dtype(encoding='utf-8'), logical=True)
    assert datatypes.fmt_dtype(vst) == 'UTF-8 string'

    # fixed-length string
    fst = h5t.py_create(h5t.string_dtype(encoding='ascii', length=3))
    assert datatypes.fmt_dtype(fst) == '3-byte ASCII string'
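# Hedged usage sketch (not taken from the tests above): how the variable- and
# fixed-length string dtypes exercised here are typically used through the
# high-level h5py API. The file and dataset names are arbitrary examples.
import h5py

with h5py.File('string_demo.h5', 'w') as f:
    utf8 = h5py.string_dtype(encoding='utf-8')               # variable-length UTF-8
    ascii3 = h5py.string_dtype(encoding='ascii', length=3)   # 3-byte fixed-length ASCII
    f.create_dataset('greetings', data=['hi', 'hello'], dtype=utf8)
    f.create_dataset('codes', shape=(2,), dtype=ascii3)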
def test_array_dtype(self):
    """ (Types) Array dtypes using non-tuple shapes """
    # np.dtype((base, shape)) is required to get an array dtype; the second
    # positional argument of np.dtype is "align", not a shape.
    dt1 = np.dtype(('f4', (2,)))
    dt2 = np.dtype(('f4', [2]))
    dt3 = np.dtype(('f4', 2))
    dt4 = np.dtype(('f4', 2.1))

    ht1 = h5t.py_create(dt1)
    ht2 = h5t.py_create(dt2)
    ht3 = h5t.py_create(dt3)
    ht4 = h5t.py_create(dt4)

    self.assertEqual(ht1.dtype, dt1)
    self.assertEqual(ht2.dtype, dt1)
    self.assertEqual(ht3.dtype, dt1)
    self.assertEqual(ht4.dtype, dt1)
def __setitem__(self, args, val):
    """ Write to the HDF5 dataset from a Numpy array.

    NumPy's broadcasting rules are honored, for "simple" indexing
    (slices and integers).  For advanced indexing, the shapes must
    match.
    """
    args = args if isinstance(args, tuple) else (args,)

    # Sort field indices from the slicing
    names = tuple(x for x in args if isinstance(x, str))
    args = tuple(x for x in args if not isinstance(x, str))
    if len(names) != 0:
        raise TypeError("Field name selections are not allowed for write.")

    # Generally we try to avoid converting the arrays on the Python
    # side.  However, for compound literals this is unavoidable.
    if self.dtype.kind == 'V' and \
            (not isinstance(val, numpy.ndarray) or val.dtype.kind != 'V'):
        val = numpy.asarray(val, dtype=self.dtype, order='C')
    else:
        val = numpy.asarray(val, order='C')

    # Check for array dtype compatibility and convert
    if self.dtype.subdtype is not None:
        shp = self.dtype.subdtype[1]
        if val.shape[-len(shp):] != shp:
            raise TypeError("Can't broadcast to array dimension %s" % (shp,))
        mtype = h5t.py_create(numpy.dtype((val.dtype, shp)))
        mshape = val.shape[0:len(val.shape)-len(shp)]
    else:
        mshape = val.shape
        mtype = None

    # Perform the dataspace selection
    selection = sel.select(self.shape, args, dsid=self.id)

    if selection.nselect == 0:
        return

    # Broadcast scalars if necessary.
    if (mshape == () and selection.mshape != ()):
        if self.dtype.subdtype is not None:
            raise NotImplementedError("Scalar broadcasting is not supported for array dtypes")
        val2 = numpy.empty(selection.mshape[-1], dtype=val.dtype)
        val2[...] = val
        val = val2
        mshape = val.shape

    # Perform the write, with broadcasting
    # Be careful to pad memory shape with ones to avoid HDF5 chunking
    # glitch, which kicks in for mismatched memory/file selections
    if len(mshape) < len(self.shape):
        mshape_pad = (1,)*(len(self.shape)-len(mshape)) + mshape
    else:
        mshape_pad = mshape
    mspace = h5s.create_simple(mshape_pad, (h5s.UNLIMITED,)*len(mshape_pad))
    for fspace in selection.broadcast(mshape):
        self.id.write(mspace, fspace, val, mtype)
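# Hedged illustration (plain high-level h5py API, not part of the implementation
# above) of the write semantics described in the docstring: broadcasting applies
# to simple slice/integer indexing, while full selections need matching shapes.
# File and dataset names are arbitrary.
import h5py
import numpy as np

with h5py.File('setitem_demo.h5', 'w') as f:
    dset = f.create_dataset('x', shape=(4, 3), dtype='f4')
    dset[0, :] = np.arange(3)       # simple indexing, shapes match exactly
    dset[1:3, :] = 7.0              # a scalar is broadcast across the selection
    dset[...] = np.zeros((4, 3))    # whole-dataset write with a matching array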
def __init__(self, _id):
    # super __init__ is handled by DatasetID.__cinit__ automatically
    self._data_dict = None
    with phil:
        sid = self.get_space()
        self._shape = sid.get_simple_extent_dims()

        self._reshaped = False

        attr = h5a.open(self, b'raw_data')
        htype = h5t.py_create(attr.dtype)
        _arr = np.ndarray(attr.shape, dtype=attr.dtype, order='C')
        attr.read(_arr, mtype=htype)
        raw_data_name = _arr[()]
        if isinstance(raw_data_name, bytes):
            raw_data_name = raw_data_name.decode('utf-8')

        fid = h5i.get_file_id(self)
        g = Group(fid)
        self.raw_data = g[raw_data_name]
        self.chunks = tuple(self.raw_data.attrs['chunks'])

        fillvalue_a = np.empty((1,), dtype=self.dtype)
        dcpl = self.get_create_plist()
        dcpl.get_fill_value(fillvalue_a)
        self.fillvalue = fillvalue_a[0]
def test_complex(self):
    """ Complex type translation

    - TypeComplexID
    - 8, 16 bytes
    - LE and BE
    - 2 members
    - Member names from cfg.complex_names
    - Members are TypeFloatID
    """
    bases = ('=c', '<c', '>c')

    for b in bases:
        for l in (8, 16):
            dt = '%s%s' % (b, l)
            htype = h5t.py_create(dt)
            self.assert_(isinstance(htype, h5t.TypeCompoundID), "wrong class")
            self.assertEqual(htype.get_size(), l, "wrong size")
            self.assertEqual(htype.get_nmembers(), 2, "wrong # members")
            for idx in (0, 1):
                self.assertEqual(htype.get_member_name(idx), cfg.complex_names[idx])
                st = htype.get_member_type(idx)
                self.assert_(isinstance(st, h5t.TypeFloatID))
                self.assertEqual(st.get_size(), l // 2)
                self.assertEqual(st.get_order(), bytemap[b[0]])
def test_plugins(self):
    shape = (32 * 1024,)
    chunks = (4 * 1024,)
    dtype = np.int64
    data = np.arange(shape[0])
    fname = "tmp_test_filters.h5"
    f = h5py.File(fname)
    tid = h5t.py_create(dtype, logical=1)
    sid = h5s.create_simple(shape, shape)
    # Different API's for different h5py versions.
    try:
        dcpl = filters.generate_dcpl(shape, dtype, chunks, None, None, None,
                                     None, None, None)
    except TypeError:
        dcpl = filters.generate_dcpl(shape, dtype, chunks, None, None, None,
                                     None, None)
    dcpl.set_filter(32008, h5z.FLAG_MANDATORY)
    dcpl.set_filter(32000, h5z.FLAG_MANDATORY)
    dset_id = h5d.create(f.id, "range", tid, sid, dcpl=dcpl)
    dset_id.write(h5s.ALL, h5s.ALL, data)
    f.close()

    # Make sure the filters are working outside of h5py by calling h5dump
    h5dump = Popen(['h5dump', fname], stdout=PIPE, stderr=STDOUT)
    stdout, nothing = h5dump.communicate()
    #print stdout
    err = h5dump.returncode
    self.assertEqual(err, 0)

    f = h5py.File(fname, 'r')
    d = f['range'][:]
    self.assertTrue(np.all(d == data))
    f.close()
def test_plugins(self):
    if not H51811P:
        return
    shape = (32 * 1024,)
    chunks = (4 * 1024,)
    dtype = np.int64
    data = np.arange(shape[0])
    fname = "tmp_test_filters.h5"
    f = h5py.File(fname)
    tid = h5t.py_create(dtype, logical=1)
    sid = h5s.create_simple(shape, shape)
    # Different API's for different h5py versions.
    try:
        dcpl = filters.generate_dcpl(shape, dtype, chunks, None, None, None,
                                     None, None, None)
    except TypeError:
        dcpl = filters.generate_dcpl(shape, dtype, chunks, None, None, None,
                                     None, None)
    dcpl.set_filter(32008, h5z.FLAG_MANDATORY)
    dcpl.set_filter(32000, h5z.FLAG_MANDATORY)
    dset_id = h5d.create(f.id, b"range", tid, sid, dcpl=dcpl)
    dset_id.write(h5s.ALL, h5s.ALL, data)
    f.close()

    # Make sure the filters are working outside of h5py by calling h5dump
    h5dump = Popen(['h5dump', fname], stdout=PIPE, stderr=STDOUT)
    stdout, nothing = h5dump.communicate()
    err = h5dump.returncode
    self.assertEqual(err, 0)

    f = h5py.File(fname, 'r')
    d = f['range'][:]
    self.assertTrue(np.all(d == data))
    f.close()
def test_detect_class(self):
    dt = dtype([(x, x) for x in simple_types])
    htype = h5t.py_create(dt)
    self.assert_(htype.detect_class(h5t.INTEGER))
    self.assert_(htype.detect_class(h5t.OPAQUE))
    self.assert_(not htype.detect_class(h5t.ARRAY))
def test_vlstring_log(self):
    """ (Types) Vlen string logical is null-term HDF5 vlen ASCII string """
    dt = h5t.special_dtype(vlen=str)
    htype = h5t.py_create(dt, logical=True)
    self.assertIsInstance(htype, h5t.TypeStringID)
    self.assertEqual(htype.is_variable_str(), True)
    self.assertEqual(htype.get_cset(), h5t.CSET_ASCII)
    self.assertEqual(htype.get_strpad(), h5t.STR_NULLTERM)
def test_ref(self):
    """ Reference types are correctly stored in compound types (issue 144) """
    dt = np.dtype([('a', h5py.ref_dtype), ('b', '<f4')])
    tid = h5t.py_create(dt, logical=True)
    t1, t2 = tid.get_member_type(0), tid.get_member_type(1)
    self.assertEqual(t1, h5t.STD_REF_OBJ)
    self.assertEqual(t2, h5t.IEEE_F32LE)
    self.assertEqual(tid.get_member_offset(0), 0)
    self.assertEqual(tid.get_member_offset(1), h5t.STD_REF_OBJ.get_size())
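# Hedged follow-up sketch (high-level API, names arbitrary, not part of the test
# above): storing an object reference inside a compound dataset, the case that
# issue 144 covers at the type level.
import h5py
import numpy as np

with h5py.File('ref_demo.h5', 'w') as f:
    target = f.create_dataset('target', data=np.arange(3))
    dt = np.dtype([('a', h5py.ref_dtype), ('b', '<f4')])
    dset = f.create_dataset('refs', shape=(1,), dtype=dt)
    arr = np.zeros((1,), dtype=dt)
    arr[0] = (target.ref, 1.5)      # reference goes into the object-typed field
    dset[...] = arr
    # The reference can later be dereferenced with f[dset[0]['a']]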
def retrieveDataObject(self):
    if not self.numpyData:
        import numpy
        from h5py import h5t
        if self.maxLength:
            dtype = h5t.py_create('S' + str(self.maxLength))
        else:
            from pypies.impl.H5pyDataStore import vlen_str_type as dtype
        #dtype.set_strpad(h5t.STR_NULLTERM)
        numpyData = numpy.asarray(self.getStringData(), dtype)
    return numpyData
def __getHdf5Datatype(self, record):
    dtype = dataRecordMap[record.__class__]
    if dtype == types.StringType:
        from h5py import h5t
        size = record.getMaxLength()
        if size > 0:
            dtype = h5t.py_create('S' + str(size))
        else:
            dtype = vlen_str_type
        #dtype.set_strpad(h5t.STR_NULLTERM)
    return dtype
def test_py_create_compound(self):
    # Compound type, each field of which is named for its type
    simple_compound = [(x, x) for x in simple_types]
    deep_compound = [('A', simple_compound), ('B', '<i4')]

    compound_types = [simple_compound, deep_compound]
    for x in compound_types:
        dt = dtype(x)
        htype = h5t.py_create(dt)
        self.assertEqual(type(htype), h5t.TypeCompoundID)
        self.assertEqual(dt, htype.dtype)
def test_py_create_array(self):
    shapes = [(1, 1), (1,), (4, 5), (99, 10, 22)]
    array_types = []
    for base in simple_types:
        for shape in shapes:
            array_types.append((base, shape))

    for x in array_types:
        dt = dtype(x)
        htype = h5t.py_create(dt)
        self.assertEqual(type(htype), h5t.TypeArrayID)
        self.assertEqual(dt, htype.dtype)
def create(self, name, data, shape=None, dtype=None):
    """ Create a new attribute, overwriting any existing attribute.

    name
        Name of the new attribute (required)
    data
        An array to initialize the attribute (required)
    shape
        Shape of the attribute.  Overrides data.shape if both are
        given, in which case the total number of points must be unchanged.
    dtype
        Data type of the attribute.  Overrides data.dtype if both are given.
    """
    if data is not None:
        data = numpy.asarray(data, order='C', dtype=dtype)
        if shape is None:
            shape = data.shape
        elif numpy.product(shape) != numpy.product(data.shape):
            raise ValueError("Shape of new attribute conflicts with shape of data")
        if dtype is None:
            dtype = data.dtype

    if isinstance(dtype, h5py.Datatype):
        htype = dtype.id
        dtype = htype.dtype
    else:
        if dtype is None:
            dtype = numpy.dtype('f')
        htype = h5t.py_create(dtype, logical=True)

    if shape is None:
        raise ValueError('At least one of "shape" or "data" must be given')

    data = data.reshape(shape)

    space = h5s.create_simple(shape)

    if name in self:
        h5a.delete(self._id, self._e(name))

    attr = h5a.create(self._id, self._e(name), htype, space)
    if data is not None:
        try:
            attr.write(data)
        except:
            attr._close()
            h5a.delete(self._id, self._e(name))
            raise
def create(self, name, data, shape=None, dtype=None):
    """ Create a new attribute, overwriting any existing attribute.

    name
        Name of the new attribute (required)
    data
        An array to initialize the attribute (required)
    shape
        Shape of the attribute.  Overrides data.shape if both are
        given, in which case the total number of points must be unchanged.
    dtype
        Data type of the attribute.  Overrides data.dtype if both are given.
    """
    with phil:
        if data is not None:
            data = numpy.asarray(data, order="C", dtype=dtype)
            if shape is None:
                shape = data.shape
            elif numpy.product(shape) != numpy.product(data.shape):
                raise ValueError("Shape of new attribute conflicts with shape of data")
            if dtype is None:
                dtype = data.dtype

        if isinstance(dtype, h5py.Datatype):
            htype = dtype.id
            dtype = htype.dtype
        else:
            if dtype is None:
                dtype = numpy.dtype("f")
            htype = h5t.py_create(dtype, logical=True)

        if shape is None:
            raise ValueError('At least one of "shape" or "data" must be given')

        data = data.reshape(shape)

        space = h5s.create_simple(shape)

        if name in self:
            h5a.delete(self._id, self._e(name))

        attr = h5a.create(self._id, self._e(name), htype, space)
        if data is not None:
            try:
                attr.write(data)
            except:
                attr._close()
                h5a.delete(self._id, self._e(name))
                raise
def create(self, name, data, shape=None, dtype=None):
    """ Create a new attribute, overwriting any existing attribute.

    name
        Name of the new attribute (required)
    data
        An array to initialize the attribute (required)
    shape
        Shape of the attribute.  Overrides data.shape if both are
        given, in which case the total number of points must be unchanged.
    dtype
        Data type of the attribute.  Overrides data.dtype if both are given.
    """
    # TODO: REMOVE WHEN UNICODE VLENS IMPLEMENTED
    # Hack to support Unicode values (scalars only)
    #if isinstance(data, unicode):
    #    unicode_hack = True
    #    data = data.encode('utf8')
    #else:
    #    unicode_hack = False

    if data is not None:
        data = numpy.asarray(data, order='C', dtype=dtype)
        if shape is None:
            shape = data.shape
        elif numpy.product(shape) != numpy.product(data.shape):
            raise ValueError("Shape of new attribute conflicts with shape of data")
        if dtype is None:
            dtype = data.dtype

    if dtype is None:
        dtype = numpy.dtype('f')
    if shape is None:
        raise ValueError('At least one of "shape" or "data" must be given')

    data = data.reshape(shape)

    space = h5s.create_simple(shape)
    htype = h5t.py_create(dtype, logical=True)

    # TODO: REMOVE WHEN UNICODE VLENS IMPLEMENTED
    #if unicode_hack:
    #    htype.set_cset(h5t.CSET_UTF8)

    if name in self:
        h5a.delete(self._id, self._e(name))

    attr = h5a.create(self._id, self._e(name), htype, space)
    if data is not None:
        attr.write(data)
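# Hedged usage sketch of the attribute-creation API implemented above, seen
# through the public AttributeManager interface (f.attrs). Names and values
# are arbitrary examples.
import h5py
import numpy as np

with h5py.File('attrs_demo.h5', 'w') as f:
    f.attrs.create('version', data=3)                          # dtype inferred from data
    f.attrs.create('scale', data=1.5, dtype='f4')              # dtype overrides data.dtype
    f.attrs.create('table', data=np.arange(6), shape=(2, 3))   # shape overrides data.shape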
def test_opaque(self):
    """ Opaque type translation

    - TypeOpaqueID
    - Sizes 1 byte to 2**31-1 bytes
    - Empty tag
    """
    for l in (1, 21, 2**31-1):
        htype = h5t.py_create('|V%s' % l)
        self.assert_(isinstance(htype, h5t.TypeOpaqueID))
        self.assertEqual(htype.get_size(), l)
        self.assertEqual(htype.get_tag(), "")
def __setitem__(self, name, obj):
    """ Add an object to the group.  The name must not already be in use.

    The action taken depends on the type of object assigned:

    Named HDF5 object (Dataset, Group, Datatype)
        A hard link is created at "name" which points to the given object.

    SoftLink or ExternalLink
        Create the corresponding link.

    Numpy ndarray
        The array is converted to a dataset object, with default
        settings (contiguous storage, etc.).

    Numpy dtype
        Commit a copy of the datatype as a named datatype in the file.

    Anything else
        Attempt to convert it to an ndarray and store it.  Scalar
        values are stored as scalar datasets.  Raise ValueError if we
        can't understand the resulting array dtype.
    """
    name, lcpl = self._e(name, lcpl=True)

    if isinstance(obj, HLObject):
        h5o.link(obj.id, self.id, name, lcpl=lcpl, lapl=self._lapl)

    elif isinstance(obj, SoftLink):
        self.id.links.create_soft(name, self._e(obj.path),
                                  lcpl=lcpl, lapl=self._lapl)

    elif isinstance(obj, ExternalLink):
        self.id.links.create_external(name, self._e(obj.filename),
                                      self._e(obj.path),
                                      lcpl=lcpl, lapl=self._lapl)

    elif isinstance(obj, numpy.dtype):
        htype = h5t.py_create(obj)
        htype.commit(self.id, name, lcpl=lcpl)

    else:
        ds = self.create_dataset(None, data=obj, dtype=base.guess_dtype(obj))
        h5o.link(ds.id, self.id, name, lcpl=lcpl)
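# Hedged sketch (high-level h5py API, file names arbitrary) of the assignment
# forms handled by Group.__setitem__ above: hard links, soft and external
# links, arrays, and committed (named) datatypes.
import h5py
import numpy as np

with h5py.File('group_demo.h5', 'w') as f:
    f['data'] = np.arange(10)                        # ndarray -> new dataset
    f['alias'] = f['data']                           # existing object -> hard link
    f['soft'] = h5py.SoftLink('/data')               # soft link to a path
    f['ext'] = h5py.ExternalLink('other.h5', '/x')   # link into another file
    f['mytype'] = np.dtype('float32')                # dtype -> committed named datatype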
def test_out_of_order_offsets(self):
    size = 20
    type_dict = {
        'names': ['f1', 'f2', 'f3'],
        'formats': ['<f4', '<i4', '<f8'],
        'offsets': [0, 16, 8]
    }

    expected_dtype = np.dtype(type_dict)

    tid = h5t.create(h5t.COMPOUND, size)
    for name, offset, dt in zip(type_dict["names"], type_dict["offsets"],
                                type_dict["formats"]):
        tid.insert(
            name.encode("utf8") if isinstance(name, text_type) else name,
            offset,
            h5t.py_create(dt))

    self.assertEqual(tid.dtype, expected_dtype)
    self.assertEqual(tid.dtype.itemsize, size)
def test_names(self):
    names = [('r', 'i'), ('real', 'imag'), (' real name ', ' img name '),
             (' Re!#@$%\t\tREALr\neal ^;;;"<>? ', ' \t*&^ . ^@IMGI MG!~\t\n\r')]

    complex_types = [x for x in simple_types if 'c' in x]
    config = h5.get_config()

    oldnames = config.complex_names
    try:
        for name in names:
            config.complex_names = name
            for ctype in complex_types:
                dt = dtype(ctype)
                htype = h5t.py_create(dt)
                self.assertEqual(type(htype), h5t.TypeCompoundID)
                self.assertEqual(htype.get_nmembers(), 2)
                self.assertEqual(htype.get_member_name(0), name[0])
                self.assertEqual(htype.get_member_name(1), name[1])
    finally:
        config.complex_names = oldnames
def test_boolean(self):
    """ Boolean type translation

    - TypeEnumID
    - Base TypeIntegerID
    - Base 1 byte
    - Base signed
    - Member names from cfg.bool_names
    - 2 values
    - Values 0, 1
    """
    htype = h5t.py_create('bool')
    self.assert_(isinstance(htype, h5t.TypeEnumID), "wrong class")
    self.assertEqual(htype.get_nmembers(), 2, "must be 2-element enum")

    basetype = htype.get_super()
    self.assertEqual(basetype.get_size(), 1, "wrong size")
    self.assertEqual(basetype.get_sign(), h5t.SGN_2, "wrong sign")

    for idx in (0, 1):
        self.assertEqual(htype.get_member_name(idx), cfg.bool_names[idx], "wrong name")
        self.assertEqual(htype.get_member_value(idx), idx, "wrong value")
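# Hedged example (not from the test suite above): NumPy booleans round-trip
# through the 1-byte enum representation described above when written with the
# high-level API. File and dataset names are arbitrary.
import h5py
import numpy as np

with h5py.File('bool_demo.h5', 'w') as f:
    d = f.create_dataset('flags', data=np.array([True, False, True]))
    assert d.dtype == np.dtype('bool')              # read back as NumPy bool
    assert d[...].tolist() == [True, False, True]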
def __setitem__(self, args, val):
    """ Write to the HDF5 dataset from a Numpy array.

    NumPy's broadcasting rules are honored, for "simple" indexing
    (slices and integers).  For advanced indexing, the shapes must
    match.
    """
    args = args if isinstance(args, tuple) else (args,)

    # Sort field indices from the slicing
    names = tuple(x for x in args if isinstance(x, str))
    args = tuple(x for x in args if not isinstance(x, str))
    if len(names) != 0:
        raise TypeError("Field name selections are not allowed for write.")

    # Generally we try to avoid converting the arrays on the Python
    # side.  However, for compound literals this is unavoidable.
    if self.dtype.kind == "O" or \
            (self.dtype.kind == 'V' and
             (not isinstance(val, numpy.ndarray) or val.dtype.kind != 'V') and
             (self.dtype.subdtype == None)):
        val = numpy.asarray(val, dtype=self.dtype, order='C')
    else:
        val = numpy.asarray(val, order='C')

    # Check for array dtype compatibility and convert
    if self.dtype.subdtype is not None:
        shp = self.dtype.subdtype[1]
        valshp = val.shape[-len(shp):]
        if valshp != shp:  # Last dimension has to match
            raise TypeError("When writing to array types, last N dimensions have to match (got %s, but should be %s)" % (valshp, shp,))
        mtype = h5t.py_create(numpy.dtype((val.dtype, shp)))
        mshape = val.shape[0:len(val.shape)-len(shp)]
    else:
        mshape = val.shape
        mtype = None

    # Perform the dataspace selection
    selection = sel.select(self.shape, args, dsid=self.id)

    if selection.nselect == 0:
        return

    # Broadcast scalars if necessary.
    if (mshape == () and selection.mshape != ()):
        if self.dtype.subdtype is not None:
            raise TypeError("Scalar broadcasting is not supported for array dtypes")
        val2 = numpy.empty(selection.mshape[-1], dtype=val.dtype)
        val2[...] = val
        val = val2
        mshape = val.shape

    # Perform the write, with broadcasting
    # Be careful to pad memory shape with ones to avoid HDF5 chunking
    # glitch, which kicks in for mismatched memory/file selections
    if len(mshape) < len(self.shape):
        mshape_pad = (1,)*(len(self.shape)-len(mshape)) + mshape
    else:
        mshape_pad = mshape
    mspace = h5s.create_simple(mshape_pad, (h5s.UNLIMITED,)*len(mshape_pad))
    for fspace in selection.broadcast(mshape):
        self.id.write(mspace, fspace, val, mtype)
def make_new_dset(parent, shape=None, dtype=None, data=None,
                  chunks=None, compression=None, shuffle=None,
                  fletcher32=None, maxshape=None, compression_opts=None,
                  fillvalue=None, scaleoffset=None, track_times=None):
    """ Return a new low-level dataset identifier

    Only creates anonymous datasets.
    """

    # Convert data to a C-contiguous ndarray
    if data is not None:
        import base
        data = numpy.asarray(data, order="C", dtype=base.guess_dtype(data))

    # Validate shape
    if shape is None:
        if data is None:
            raise TypeError("Either data or shape must be specified")
        shape = data.shape
    else:
        shape = tuple(shape)
        if data is not None and (numpy.product(shape) != numpy.product(data.shape)):
            raise ValueError("Shape tuple is incompatible with data")

    # Validate dtype
    if dtype is None and data is None:
        dtype = numpy.dtype("=f4")
    elif dtype is None and data is not None:
        dtype = data.dtype
    else:
        dtype = numpy.dtype(dtype)

    # Legacy
    if any((compression, shuffle, fletcher32, maxshape, scaleoffset)) and chunks is False:
        raise ValueError("Chunked format required for given storage options")

    # Legacy
    if compression is True:
        if compression_opts is None:
            compression_opts = 4
        compression = 'gzip'

    # Legacy
    if compression in range(10):
        if compression_opts is not None:
            raise TypeError("Conflict in compression options")
        compression_opts = compression
        compression = 'gzip'

    dcpl = filters.generate_dcpl(shape, dtype, chunks, compression,
                                 compression_opts, shuffle, fletcher32,
                                 maxshape, scaleoffset)

    if fillvalue is not None:
        fillvalue = numpy.array(fillvalue)
        dcpl.set_fill_value(fillvalue)

    if track_times in (True, False):
        dcpl.set_obj_track_times(track_times)
    elif track_times is not None:
        raise TypeError("track_times must be either True or False")

    if maxshape is not None:
        maxshape = tuple(m if m is not None else h5s.UNLIMITED for m in maxshape)
    sid = h5s.create_simple(shape, maxshape)
    tid = h5t.py_create(dtype, logical=1)

    dset_id = h5d.create(parent.id, None, tid, sid, dcpl=dcpl)

    if data is not None:
        dset_id.write(h5s.ALL, h5s.ALL, data)

    return dset_id
def __getitem__(self, args):
    """ Read a slice from the HDF5 dataset.

    Takes slices and recarray-style field names (more than one is
    allowed!) in any order.  Obeys basic NumPy rules, including
    broadcasting.

    Also supports:

    * Boolean "mask" array indexing
    """
    args = args if isinstance(args, tuple) else (args,)

    # Sort field indices from the rest of the args.
    names = tuple(x for x in args if isinstance(x, str))
    args = tuple(x for x in args if not isinstance(x, str))

    def strip_fields(basetype):
        """ Strip extra dtype information from special types """
        if basetype.kind == 'O':
            return numpy.dtype('O')
        if basetype.fields is not None:
            if basetype.kind in ('i', 'u'):
                return basetype.fields['enum'][0]
            fields = []
            for name in basetype.names:
                fff = basetype.fields[name]
                if len(fff) == 3:
                    (subtype, offset, meta) = fff
                else:
                    subtype, meta = fff
                    offset = 0
                subtype = strip_fields(subtype)
                fields.append((name, subtype))
            return numpy.dtype(fields)
        return basetype

    def readtime_dtype(basetype, names):
        """ Make a NumPy dtype appropriate for reading """
        basetype = strip_fields(basetype)

        if len(names) == 0:  # Not compound, or we want all fields
            return basetype

        if basetype.names is None:  # Names provided, but not compound
            raise ValueError("Field names only allowed for compound types")

        for name in names:  # Check all names are legal
            if not name in basetype.names:
                raise ValueError("Field %s does not appear in this type." % name)

        return numpy.dtype([(name, basetype.fields[name][0]) for name in names])

    # This is necessary because in the case of array types, NumPy
    # discards the array information at the top level.
    new_dtype = readtime_dtype(self.id.dtype, names)
    mtype = h5t.py_create(new_dtype)

    # === Scalar dataspaces =================

    if self.shape == ():
        fspace = self.id.get_space()
        selection = sel2.select_read(fspace, args)
        arr = numpy.ndarray(selection.mshape, dtype=new_dtype)
        for mspace, fspace in selection:
            self.id.read(mspace, fspace, arr, mtype)
        if selection.mshape is None:
            return arr[()]
        return arr

    # === Everything else ===================

    # Perform the dataspace selection.
    selection = sel.select(self.shape, args, dsid=self.id)

    if selection.nselect == 0:
        return numpy.ndarray((0,), dtype=new_dtype)

    # Up-converting to (1,) so that numpy.ndarray correctly creates
    # np.void rows in case of multi-field dtype. (issue 135)
    single_element = selection.mshape == ()
    mshape = (1,) if single_element else selection.mshape
    arr = numpy.ndarray(mshape, new_dtype, order='C')

    # HDF5 has a bug where if the memory shape has a different rank
    # than the dataset, the read is very slow
    if len(mshape) < len(self.shape):
        # pad with ones
        mshape = (1,)*(len(self.shape)-len(mshape)) + mshape

    # Perform the actual read
    mspace = h5s.create_simple(mshape)
    fspace = selection._id
    self.id.read(mspace, fspace, arr, mtype)

    # Patch up the output for NumPy
    if len(names) == 1:
        arr = arr[names[0]]  # Single-field recarray convention
    if arr.shape == ():
        arr = numpy.asscalar(arr)
    if single_element:
        arr = arr[0]
    return arr
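# Hedged read example (public h5py API, file and dataset names arbitrary) for
# the behaviour implemented in __getitem__ above: slicing, recarray-style field
# selection, and boolean mask indexing.
import h5py
import numpy as np

with h5py.File('getitem_demo.h5', 'w') as f:
    table = f.create_dataset('table', data=np.zeros(5, dtype=[('a', 'f4'), ('b', 'i4')]))
    part = table[1:4]                # slice -> structured array of 3 rows
    col = table['a']                 # field name -> only the 'a' column
    vals = f.create_dataset('vals', data=np.arange(5.0))
    masked = vals[vals[:] > 2.0]     # boolean "mask" array indexing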
def __setitem__(self, args, val):
    """ Write to the HDF5 dataset from a Numpy array.

    NumPy's broadcasting rules are honored, for "simple" indexing
    (slices and integers).  For advanced indexing, the shapes must
    match.
    """
    args = args if isinstance(args, tuple) else (args,)

    # Sort field indices from the slicing
    names = tuple(x for x in args if isinstance(x, str))
    args = tuple(x for x in args if not isinstance(x, str))

    # Generally we try to avoid converting the arrays on the Python
    # side.  However, for compound literals this is unavoidable.
    vlen = h5t.check_dtype(vlen=self.dtype)
    if vlen not in (bytes, unicode, None):
        try:
            val = numpy.asarray(val, dtype=vlen)
        except ValueError:
            try:
                val = numpy.array([numpy.array(x, dtype=vlen) for x in val],
                                  dtype=self.dtype)
            except ValueError:
                pass
        if vlen == val.dtype:
            if val.ndim > 1:
                tmp = numpy.empty(shape=val.shape[:-1], dtype=object)
                tmp.ravel()[:] = [i for i in val.reshape(
                    (numpy.product(val.shape[:-1]), val.shape[-1]))]
            else:
                tmp = numpy.array([None], dtype=object)
                tmp[0] = val
            val = tmp
    elif self.dtype.kind == "O" or \
            (self.dtype.kind == 'V' and
             (not isinstance(val, numpy.ndarray) or val.dtype.kind != 'V') and
             (self.dtype.subdtype == None)):
        if len(names) == 1 and self.dtype.fields is not None:
            # Single field selected for write, from a non-array source
            if not names[0] in self.dtype.fields:
                raise ValueError("No such field for indexing: %s" % names[0])
            dtype = self.dtype.fields[names[0]][0]
            cast_compound = True
        else:
            dtype = self.dtype
            cast_compound = False
        val = numpy.asarray(val, dtype=dtype, order='C')
        if cast_compound:
            val = val.astype(numpy.dtype([(names[0], dtype)]))
    else:
        val = numpy.asarray(val, order='C')

    # Check for array dtype compatibility and convert
    if self.dtype.subdtype is not None:
        shp = self.dtype.subdtype[1]
        valshp = val.shape[-len(shp):]
        if valshp != shp:  # Last dimension has to match
            raise TypeError("When writing to array types, last N dimensions have to match (got %s, but should be %s)" % (valshp, shp,))
        mtype = h5t.py_create(numpy.dtype((val.dtype, shp)))
        mshape = val.shape[0:len(val.shape)-len(shp)]

    # Make a compound memory type if field-name slicing is required
    elif len(names) != 0:
        mshape = val.shape

        # Catch common errors
        if self.dtype.fields is None:
            raise TypeError("Illegal slicing argument (not a compound dataset)")
        mismatch = [x for x in names if x not in self.dtype.fields]
        if len(mismatch) != 0:
            mismatch = ", ".join('"%s"' % x for x in mismatch)
            raise ValueError("Illegal slicing argument (fields %s not in dataset type)" % mismatch)

        # Write non-compound source into a single dataset field
        if len(names) == 1 and val.dtype.fields is None:
            subtype = h5t.py_create(val.dtype)  # fixed: original read "h5y.py_create", an obvious typo
            mtype = h5t.create(h5t.COMPOUND, subtype.get_size())
            mtype.insert(self._e(names[0]), 0, subtype)

        # Make a new source type keeping only the requested fields
        else:
            fieldnames = [x for x in val.dtype.names if x in names]  # Keep source order
            mtype = h5t.create(h5t.COMPOUND, val.dtype.itemsize)
            for fieldname in fieldnames:
                subtype = h5t.py_create(val.dtype.fields[fieldname][0])
                offset = val.dtype.fields[fieldname][1]
                mtype.insert(self._e(fieldname), offset, subtype)

    # Use mtype derived from array (let DatasetID.write figure it out)
    else:
        mshape = val.shape
        mtype = None

    # Perform the dataspace selection
    selection = sel.select(self.shape, args, dsid=self.id)

    if selection.nselect == 0:
        return

    # Broadcast scalars if necessary.
    if (mshape == () and selection.mshape != ()):
        if self.dtype.subdtype is not None:
            raise TypeError("Scalar broadcasting is not supported for array dtypes")
        val2 = numpy.empty(selection.mshape[-1], dtype=val.dtype)
        val2[...] = val
        val = val2
        mshape = val.shape

    # Perform the write, with broadcasting
    # Be careful to pad memory shape with ones to avoid HDF5 chunking
    # glitch, which kicks in for mismatched memory/file selections
    if len(mshape) < len(self.shape):
        mshape_pad = (1,) * (len(self.shape) - len(mshape)) + mshape
    else:
        mshape_pad = mshape
    mspace = h5s.create_simple(mshape_pad, (h5s.UNLIMITED,) * len(mshape_pad))
    for fspace in selection.broadcast(mshape):
        self.id.write(mspace, fspace, val, mtype)
def make_new_dset(parent, shape=None, dtype=None, data=None,
                  chunks=None, compression=None, shuffle=None,
                  fletcher32=None, maxshape=None, compression_opts=None,
                  fillvalue=None, scaleoffset=None, track_times=None):
    """ Return a new low-level dataset identifier

    Only creates anonymous datasets.
    """

    # Convert data to a C-contiguous ndarray
    if data is not None:
        import base
        data = numpy.asarray(data, order="C", dtype=base.guess_dtype(data))

    # Validate shape
    if shape is None:
        if data is None:
            raise TypeError("Either data or shape must be specified")
        shape = data.shape
    else:
        shape = tuple(shape)
        if data is not None and (numpy.product(shape) != numpy.product(data.shape)):
            raise ValueError("Shape tuple is incompatible with data")

    tmp_shape = maxshape if maxshape is not None else shape
    # Validate chunk shape
    if isinstance(chunks, tuple) and (-numpy.array(
            [i >= j for i, j in zip(tmp_shape, chunks) if i is not None])).any():
        errmsg = "Chunk shape must not be greater than data shape in any dimension. "\
                 "{} is not compatible with {}".format(chunks, shape)
        raise ValueError(errmsg)

    if isinstance(dtype, h5py.Datatype):
        # Named types are used as-is
        tid = dtype.id
        dtype = tid.dtype  # Following code needs this
    else:
        # Validate dtype
        if dtype is None and data is None:
            dtype = numpy.dtype("=f4")
        elif dtype is None and data is not None:
            dtype = data.dtype
        else:
            dtype = numpy.dtype(dtype)
        tid = h5t.py_create(dtype, logical=1)

    # Legacy
    if any((compression, shuffle, fletcher32, maxshape, scaleoffset)) and chunks is False:
        raise ValueError("Chunked format required for given storage options")

    # Legacy
    if compression is True:
        if compression_opts is None:
            compression_opts = 4
        compression = 'gzip'

    # Legacy
    if compression in _LEGACY_GZIP_COMPRESSION_VALS:
        if compression_opts is not None:
            raise TypeError("Conflict in compression options")
        compression_opts = compression
        compression = 'gzip'

    dcpl = filters.generate_dcpl(shape, dtype, chunks, compression,
                                 compression_opts, shuffle, fletcher32,
                                 maxshape, scaleoffset)

    if fillvalue is not None:
        fillvalue = numpy.array(fillvalue)
        dcpl.set_fill_value(fillvalue)

    if track_times in (True, False):
        dcpl.set_obj_track_times(track_times)
    elif track_times is not None:
        raise TypeError("track_times must be either True or False")

    if maxshape is not None:
        maxshape = tuple(m if m is not None else h5s.UNLIMITED for m in maxshape)
    sid = h5s.create_simple(shape, maxshape)

    dset_id = h5d.create(parent.id, None, tid, sid, dcpl=dcpl)

    if data is not None:
        dset_id.write(h5s.ALL, h5s.ALL, data)

    return dset_id
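# Hedged low-level sketch of the core calls make_new_dset performs for a simple
# case: build a type id with h5t.py_create, a simple dataspace, then h5d.create
# and write. This mirrors the code above but is not the h5py implementation
# itself; the file and dataset names are arbitrary.
import h5py
import numpy as np
from h5py import h5d, h5s, h5t

data = np.arange(12, dtype='f4').reshape(3, 4)
with h5py.File('lowlevel_demo.h5', 'w') as f:
    tid = h5t.py_create(data.dtype, logical=1)   # HDF5 datatype for the NumPy dtype
    sid = h5s.create_simple(data.shape)          # dataspace with the array's shape
    dset_id = h5d.create(f.id, b'demo', tid, sid)
    dset_id.write(h5s.ALL, h5s.ALL, data)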
def concatenate(data_list, out_group=None, start=None, stop=None,
                datasets=None, dataset_filter=None):
    """Concatenate data along the time axis.

    All :class:`TOData` objects to be concatenated are assumed to have the
    same datasets and index_maps with compatible shapes and data types.

    Currently only 'time' axis concatenation is supported, and it must be the
    fastest varying index.

    All attributes, history, and other non-time-dependent information is
    copied from the first item.

    Parameters
    ----------
    data_list : list of :class:`TOData`.
        These are assumed to be identical in every way except along the axes
        representing time, over which they are concatenated. All other data
        and attributes are simply copied from the first entry of the list.
    out_group : `h5py.Group`, hdf5 filename or `memh5.Group`
        Underlying hdf5 like container that will store the data for the
        BaseData instance.
    start : int or dict with keys ``data_list[0].time_axes``
        In the aggregate datasets at what index to start. Everything before
        this index is excluded.
    stop : int or dict with keys ``data_list[0].time_axes``
        In the aggregate datasets at what index to stop. Everything after
        this index is excluded.
    datasets : sequence of strings
        Which datasets to include. Default is all of them.
    dataset_filter : callable
        Function for preprocessing all datasets. Useful for changing data
        types etc. Should return a dataset.

    Returns
    -------
    data : :class:`TOData`

    """

    if dataset_filter is None:
        dataset_filter = lambda d: d

    # Inspect first entry in the list to get the constant parts.
    first_data = data_list[0]
    concatenation_axes = first_data.time_axes

    # Ensure *start* and *stop* are mappings.
    if not hasattr(start, '__getitem__'):
        start = {axis: start for axis in concatenation_axes}
    if not hasattr(stop, '__getitem__'):
        stop = {axis: stop for axis in concatenation_axes}

    # Get the length of all axes for which we are concatenating.
    concat_index_lengths = {axis: 0 for axis in concatenation_axes}
    for data in data_list:
        for index_name in concatenation_axes:
            if index_name not in data.index_map.keys():
                continue
            concat_index_lengths[index_name] += len(data.index_map[index_name])

    # Get real start and stop indexes.
    for axis in concatenation_axes:
        start[axis], stop[axis] = _start_stop_inds(
            start.get(axis, None),
            stop.get(axis, None),
            concat_index_lengths[axis],
        )

    if first_data.distributed and not isinstance(out_group, h5py.Group):
        distributed = True
        comm = first_data.comm
    else:
        distributed = False
        comm = None

    # Choose return class and initialize the object.
    out = first_data.__class__(out_group, distributed=distributed, comm=comm)

    # Resolve the index maps. XXX Shouldn't be necessary after fix to
    # _copy_non_time_data.
    for axis, index_map in first_data.index_map.items():
        if axis in concatenation_axes:
            # Initialize the dataset.
            dtype = index_map.dtype
            out.create_index_map(
                axis,
                np.empty(shape=(stop[axis] - start[axis],), dtype=dtype),
            )
        else:
            # Just copy it.
            out.create_index_map(axis, index_map)

    all_dataset_names = _copy_non_time_data(data_list, out)
    if datasets is None:
        dataset_names = all_dataset_names
    else:
        dataset_names = datasets

    current_concat_index_start = {axis: 0 for axis in concatenation_axes}
    # Now loop over the list and copy the data.
    for data in data_list:
        # Get the concatenation axis lengths for this BaseData.
        current_concat_index_n = {axis: len(data.index_map.get(axis, []))
                                  for axis in concatenation_axes}

        # Start with the index_map.
        for axis in concatenation_axes:
            axis_finished = current_concat_index_start[axis] >= stop[axis]
            axis_not_started = (current_concat_index_start[axis]
                                + current_concat_index_n[axis] <= start[axis])
            if axis_finished or axis_not_started:
                continue
            in_slice, out_slice = _get_in_out_slice(
                start[axis], stop[axis],
                current_concat_index_start[axis],
                current_concat_index_n[axis],
            )
            out.index_map[axis][out_slice] = data.index_map[axis][in_slice]

        # Now copy over the datasets and flags.
        this_dataset_names = _copy_non_time_data(data)
        for name in this_dataset_names:
            dataset = data[name]
            if name not in dataset_names:
                continue
            attrs = dataset.attrs
            dataset = dataset_filter(dataset)
            if hasattr(dataset, "attrs"):
                # Some filters modify the attributes; others return a thing
                # without attributes. So we need to check.
                attrs = dataset.attrs

            # For now only support concatenation over minor axis.
            axis = attrs['axis'][-1]
            if axis not in concatenation_axes:
                msg = "Dataset %s does not have a valid concatenation axis."
                raise ValueError(msg % name)

            axis_finished = current_concat_index_start[axis] >= stop[axis]
            axis_not_started = (current_concat_index_start[axis]
                                + current_concat_index_n[axis] <= start[axis])
            if axis_finished or axis_not_started:
                continue

            # Place holder for eventual implementation of 'axis_rate' attribute.
            axis_rate = 1

            # If this is the first piece of data, initialize the output
            # dataset.
            #out_keys = ['flags/' + n for n in out.flags.keys()]
            #out_keys += out.datasets.keys()
            if name not in out:
                shape = dataset.shape
                dtype = dataset.dtype
                full_shape = shape[:-1] + ((stop[axis] - start[axis]) * axis_rate,)
                if (distributed
                        and isinstance(dataset, memh5.MemDatasetDistributed)):
                    new_dset = out.create_dataset(
                        name,
                        shape=full_shape,
                        dtype=dtype,
                        distributed=True,
                        distributed_axis=dataset.distributed_axis,
                    )
                else:
                    new_dset = out.create_dataset(name, shape=full_shape, dtype=dtype)
                memh5.copyattrs(attrs, new_dset.attrs)

            out_dset = out[name]
            in_slice, out_slice = _get_in_out_slice(
                start[axis] * axis_rate,
                stop[axis] * axis_rate,
                current_concat_index_start[axis] * axis_rate,
                current_concat_index_n[axis] * axis_rate,
            )

            # Awkward special case for pure subarray dtypes, which h5py and
            # numpy treat differently.
            out_dtype = out_dset.dtype
            if (out_dtype.kind == 'V' and not out_dtype.fields
                    and out_dtype.shape and isinstance(out_dset, h5py.Dataset)):
                #index_pairs = zip(range(dataset.shape[-1])[in_slice],
                #                  range(out_dset.shape[-1])[out_slice])
                # Drop down to low level interface. I think this is only
                # necessary for pretty old h5py.
                from h5py import h5t
                from h5py._hl import selections
                mtype = h5t.py_create(out_dtype)
                mdata = dataset[..., in_slice].copy().flat[:]
                mspace = selections.SimpleSelection(
                    (mdata.size // out_dtype.itemsize,)).id
                fspace = selections.select(out_dset.shape, out_slice, out_dset.id).id
                out_dset.id.write(mspace, fspace, mdata, mtype)
            else:
                out_dset[..., out_slice] = dataset[..., in_slice]

        # Increment the start indexes for the next item of the list.
        for axis in current_concat_index_start.keys():
            current_concat_index_start[axis] += current_concat_index_n[axis]

    return out
def test_standard_float():
    ft = h5t.py_create(np.dtype('<f4'))
    assert datatypes.fmt_dtype(ft) == 'float32'
    assert datatypes.dtype_description(ft) == '32-bit floating point'
def test_bool(self):
    out = h5t.py_create('bool')
    self.assert_(isinstance(out, h5t.TypeEnumID))
    self.assertEqual(out.get_nmembers(), 2)
    self.assertEqual(out.dtype, dtype('bool'))
def test_vlen():
    vt_n = h5t.vlen_dtype(np.dtype('<i2'))
    vt_h = h5t.py_create(vt_n, logical=True)
    assert datatypes.fmt_dtype(vt_h) == 'vlen array of int16'
def __setitem__(self, args, val):
    """ Write to the HDF5 dataset from a Numpy array.

    NumPy's broadcasting rules are honored, for "simple" indexing
    (slices and integers).  For advanced indexing, the shapes must
    match.
    """
    args = args if isinstance(args, tuple) else (args,)

    # Sort field indices from the slicing
    names = tuple(x for x in args if isinstance(x, str))
    args = tuple(x for x in args if not isinstance(x, str))

    # Generally we try to avoid converting the arrays on the Python
    # side.  However, for compound literals this is unavoidable.
    if self.dtype.kind == "O" or \
            (self.dtype.kind == 'V' and
             (not isinstance(val, numpy.ndarray) or val.dtype.kind != 'V') and
             (self.dtype.subdtype == None)):
        if len(names) == 1 and self.dtype.fields is not None:
            # Single field selected for write, from a non-array source
            if not names[0] in self.dtype.fields:
                raise ValueError("No such field for indexing: %s" % names[0])
            dtype = self.dtype.fields[names[0]][0]
            cast_compound = True
        else:
            dtype = self.dtype
            cast_compound = False
        val = numpy.asarray(val, dtype=dtype, order='C')
        if cast_compound:
            val = val.astype(numpy.dtype([(names[0], dtype)]))
    else:
        val = numpy.asarray(val, order='C')

    # Check for array dtype compatibility and convert
    if self.dtype.subdtype is not None:
        shp = self.dtype.subdtype[1]
        valshp = val.shape[-len(shp):]
        if valshp != shp:  # Last dimension has to match
            raise TypeError("When writing to array types, last N dimensions have to match (got %s, but should be %s)" % (valshp, shp,))
        mtype = h5t.py_create(numpy.dtype((val.dtype, shp)))
        mshape = val.shape[0:len(val.shape)-len(shp)]

    # Make a compound memory type if field-name slicing is required
    elif len(names) != 0:
        mshape = val.shape

        # Catch common errors
        if self.dtype.fields is None:
            raise TypeError("Illegal slicing argument (not a compound dataset)")
        mismatch = [x for x in names if x not in self.dtype.fields]
        if len(mismatch) != 0:
            mismatch = ", ".join('"%s"' % x for x in mismatch)
            raise ValueError("Illegal slicing argument (fields %s not in dataset type)" % mismatch)

        # Write non-compound source into a single dataset field
        if len(names) == 1 and val.dtype.fields is None:
            subtype = h5t.py_create(val.dtype)  # fixed: original read "h5y.py_create", an obvious typo
            mtype = h5t.create(h5t.COMPOUND, subtype.get_size())
            mtype.insert(self._e(names[0]), 0, subtype)

        # Make a new source type keeping only the requested fields
        else:
            fieldnames = [x for x in val.dtype.names if x in names]  # Keep source order
            mtype = h5t.create(h5t.COMPOUND, val.dtype.itemsize)
            for fieldname in fieldnames:
                subtype = h5t.py_create(val.dtype.fields[fieldname][0])
                offset = val.dtype.fields[fieldname][1]
                mtype.insert(self._e(fieldname), offset, subtype)

    # Use mtype derived from array (let DatasetID.write figure it out)
    else:
        mshape = val.shape
        mtype = None

    # Perform the dataspace selection
    selection = sel.select(self.shape, args, dsid=self.id)

    if selection.nselect == 0:
        return

    # Broadcast scalars if necessary.
    if (mshape == () and selection.mshape != ()):
        if self.dtype.subdtype is not None:
            raise TypeError("Scalar broadcasting is not supported for array dtypes")
        val2 = numpy.empty(selection.mshape[-1], dtype=val.dtype)
        val2[...] = val
        val = val2
        mshape = val.shape

    # Perform the write, with broadcasting
    # Be careful to pad memory shape with ones to avoid HDF5 chunking
    # glitch, which kicks in for mismatched memory/file selections
    if len(mshape) < len(self.shape):
        mshape_pad = (1,)*(len(self.shape)-len(mshape)) + mshape
    else:
        mshape_pad = mshape
    mspace = h5s.create_simple(mshape_pad, (h5s.UNLIMITED,)*len(mshape_pad))
    for fspace in selection.broadcast(mshape):
        self.id.write(mspace, fspace, val, mtype)
def test(dt):
    """ Check get_super for a given dtype """
    htype = h5t.py_create(dt)
    atype = h5t.array_create(htype, (4, 5))
    self.assert_(htype.equal(atype.get_super()))
def test_compound():
    ct_n = np.dtype([('x', np.float32), ('y', np.float32)])
    ct_h = h5t.py_create(ct_n)
    assert datatypes.fmt_dtype(ct_h) == '(x: float32, y: float32)'
from unittest import TestCase

import numpy as np
from h5py import h5t

import ctypes

strings = ["Hi", "Hello", "This is a string", "HDF5 is awesome!"]

vlen_dtype = h5t.special_dtype(vlen=str)
vlen_htype = h5t.py_create(vlen_dtype, logical=1)
obj_htype = h5t.py_create(vlen_dtype)


class TestVlenObject(TestCase):

    """ Test conversion routines between string vlens and object pointers """

    def test_obj2vlen_simple(self):
        """ Object to vlen (contiguous) """
        objarr = np.array(strings, dtype=vlen_dtype)

        destbuffer = np.ndarray(objarr.shape, dtype=np.uintp, buffer=objarr).copy()

        h5t.convert(obj_htype, vlen_htype, len(strings), destbuffer)

        for idx, val in enumerate(destbuffer):
            self.assertEqual(ctypes.string_at(int(val)), strings[idx])
def test_enum():
    et_n = h5t.enum_dtype({'apple': 1, 'banana': 2})
    et_h = h5t.py_create(et_n, logical=True)
    assert datatypes.fmt_dtype(et_h) == 'enum (apple, banana)'
def create_compact_dataset(loc, name, shape=None, dtype=None, data=None,
                           chunks=None, compression=None, shuffle=None,
                           fletcher32=None, maxshape=None,
                           compression_opts=None, fillvalue=None,
                           scaleoffset=None, track_times=None):
    """Create a new HDF5 dataset with a compact storage layout."""

    # Convert data to a C-contiguous ndarray
    if data is not None:
        import h5py._hl.base
        data = numpy.asarray(data, order="C",
                             dtype=h5py._hl.base.guess_dtype(data))

    # Validate shape
    if shape is None:
        if data is None:
            raise TypeError("Either data or shape must be specified")
        shape = data.shape
    else:
        shape = tuple(shape)
        if data is not None and (numpy.product(shape) != numpy.product(data.shape)):
            raise ValueError("Shape tuple is incompatible with data")

    if isinstance(dtype, h5py.Datatype):
        # Named types are used as-is
        tid = dtype.id
        dtype = tid.dtype  # Following code needs this
    else:
        # Validate dtype
        if dtype is None and data is None:
            dtype = numpy.dtype("=f4")
        elif dtype is None and data is not None:
            dtype = data.dtype
        else:
            dtype = numpy.dtype(dtype)
        tid = h5t.py_create(dtype, logical=1)

    # Legacy
    if any((compression, shuffle, fletcher32, maxshape, scaleoffset)) and chunks is False:
        raise ValueError("Chunked format required for given storage options")

    # Legacy
    if compression is True:
        if compression_opts is None:
            compression_opts = 4
        compression = 'gzip'

    # Legacy
    if compression in range(10):
        if compression_opts is not None:
            raise TypeError("Conflict in compression options")
        compression_opts = compression
        compression = 'gzip'

    if h5py.version.version_tuple >= (2, 2, 0, ''):
        dcpl = filters.generate_dcpl(shape, dtype, chunks, compression,
                                     compression_opts, shuffle, fletcher32,
                                     maxshape, None)
    else:
        dcpl = filters.generate_dcpl(shape, dtype, chunks, compression,
                                     compression_opts, shuffle, fletcher32,
                                     maxshape)

    if fillvalue is not None:
        fillvalue = numpy.array(fillvalue)
        dcpl.set_fill_value(fillvalue)

    if track_times in (True, False):
        dcpl.set_obj_track_times(track_times)
    elif track_times is not None:
        raise TypeError("track_times must be either True or False")

    dcpl.set_layout(h5d.COMPACT)

    if maxshape is not None:
        maxshape = tuple(m if m is not None else h5s.UNLIMITED for m in maxshape)
    sid = h5s.create_simple(shape, maxshape)

    dset_id = h5d.create(loc.id, None, tid, sid, dcpl=dcpl)

    if data is not None:
        dset_id.write(h5s.ALL, h5s.ALL, data)

    dset = dataset.Dataset(dset_id)
    if name is not None:
        loc[name] = dset
    return dset
def test_array():
    at_n = np.dtype((np.float64, (3, 4)))
    at_h = h5t.py_create(at_n)
    assert datatypes.fmt_dtype(at_h) == '3 × 4 array of float64'
def concatenate(data_list, out_group=None, start=None, stop=None,
                datasets=None, dataset_filter=None):
    """Concatenate data along the time axis.

    All :class:`TOData` objects to be concatenated are assumed to have the
    same datasets and index_maps with compatible shapes and data types.

    Currently only 'time' axis concatenation is supported, and it must be the
    fastest varying index.

    All attributes, history, and other non-time-dependent information is
    copied from the first item.

    Parameters
    ----------
    data_list : list of :class:`TOData`.
        These are assumed to be identical in every way except along the axes
        representing time, over which they are concatenated. All other data
        and attributes are simply copied from the first entry of the list.
    out_group : `h5py.Group`, hdf5 filename or `memh5.Group`
        Underlying hdf5 like container that will store the data for the
        BaseData instance.
    start : int or dict with keys ``data_list[0].time_axes``
        In the aggregate datasets at what index to start. Everything before
        this index is excluded.
    stop : int or dict with keys ``data_list[0].time_axes``
        In the aggregate datasets at what index to stop. Everything after
        this index is excluded.
    datasets : sequence of strings
        Which datasets to include. Default is all of them.
    dataset_filter : callable with one or two arguments
        Function for preprocessing all datasets. Useful for changing data
        types etc. Takes a dataset as an argument and should return a dataset
        (either h5py or memh5). Optionally may accept a second argument that
        is a slice along the time axis, which the filter should apply.

    Returns
    -------
    data : :class:`TOData`

    """

    if dataset_filter is None:
        def dataset_filter(d):
            return d

    filter_time_slice = len(inspect.getargspec(dataset_filter).args) == 2

    # Inspect first entry in the list to get the constant parts.
    first_data = data_list[0]
    concatenation_axes = first_data.time_axes

    # Ensure *start* and *stop* are mappings.
    if not hasattr(start, '__getitem__'):
        start = {axis: start for axis in concatenation_axes}
    if not hasattr(stop, '__getitem__'):
        stop = {axis: stop for axis in concatenation_axes}

    # Get the length of all axes for which we are concatenating.
    concat_index_lengths = {axis: 0 for axis in concatenation_axes}
    for data in data_list:
        for index_name in concatenation_axes:
            if index_name not in data.index_map:
                continue
            concat_index_lengths[index_name] += len(data.index_map[index_name])

    # Get real start and stop indexes.
    for axis in concatenation_axes:
        start[axis], stop[axis] = _start_stop_inds(
            start.get(axis, None),
            stop.get(axis, None),
            concat_index_lengths[axis],
        )

    if first_data.distributed and not isinstance(out_group, h5py.Group):
        distributed = True
        comm = first_data.comm
    else:
        distributed = False
        comm = None

    # Choose return class and initialize the object.
    out = first_data.__class__(out_group, distributed=distributed, comm=comm)

    # Resolve the index maps. XXX Shouldn't be necessary after fix to
    # _copy_non_time_data.
    for axis, index_map in first_data.index_map.items():
        if axis in concatenation_axes:
            # Initialize the dataset.
            dtype = index_map.dtype
            out.create_index_map(
                axis,
                np.empty(shape=(stop[axis] - start[axis],), dtype=dtype),
            )
        else:
            # Just copy it.
            out.create_index_map(axis, index_map)

    # Copy over the reverse maps.
    for axis, reverse_map in first_data.reverse_map.items():
        out.create_reverse_map(axis, reverse_map)

    all_dataset_names = _copy_non_time_data(data_list, out)
    if datasets is None:
        dataset_names = all_dataset_names
    else:
        dataset_names = datasets

    current_concat_index_start = {axis: 0 for axis in concatenation_axes}
    # Now loop over the list and copy the data.
    for data in data_list:
        # Get the concatenation axis lengths for this BaseData.
        current_concat_index_n = {axis: len(data.index_map.get(axis, []))
                                  for axis in concatenation_axes}

        # Start with the index_map.
        for axis in concatenation_axes:
            axis_finished = current_concat_index_start[axis] >= stop[axis]
            axis_not_started = (current_concat_index_start[axis]
                                + current_concat_index_n[axis] <= start[axis])
            if axis_finished or axis_not_started:
                continue
            in_slice, out_slice = _get_in_out_slice(
                start[axis], stop[axis],
                current_concat_index_start[axis],
                current_concat_index_n[axis],
            )
            out.index_map[axis][out_slice] = data.index_map[axis][in_slice]

        # Now copy over the datasets and flags.
        this_dataset_names = _copy_non_time_data(data)
        for name in this_dataset_names:
            dataset = data[name]
            if name not in dataset_names:
                continue
            attrs = dataset.attrs

            # Figure out which axis we are concatenating over.
            for a in memh5.bytes_to_unicode(attrs['axis']):
                if a in concatenation_axes:
                    axis = a
                    break
            else:
                msg = "Dataset %s does not have a valid concatenation axis."
                raise ValueError(msg % name)

            # Figure out where we are in that axis and how to slice it.
            axis_finished = current_concat_index_start[axis] >= stop[axis]
            axis_not_started = (current_concat_index_start[axis]
                                + current_concat_index_n[axis] <= start[axis])
            if axis_finished or axis_not_started:
                continue

            axis_rate = 1  # Place holder for eventual implementation.
            in_slice, out_slice = _get_in_out_slice(
                start[axis] * axis_rate,
                stop[axis] * axis_rate,
                current_concat_index_start[axis] * axis_rate,
                current_concat_index_n[axis] * axis_rate,
            )

            # Filter the dataset.
            if filter_time_slice:
                dataset = dataset_filter(dataset, in_slice)
            else:
                dataset = dataset_filter(dataset)
            if hasattr(dataset, "attrs"):
                # Some filters modify the attributes; others return a thing
                # without attributes. So we need to check.
                attrs = dataset.attrs

            # Do this *after* the filter, in case the filter changed axis order.
            axis_ind = list(memh5.bytes_to_unicode(attrs['axis'])).index(axis)

            # Slice input data if the filter doesn't do it.
            if not filter_time_slice:
                in_slice = (slice(None),) * axis_ind + (in_slice,)
                dataset = dataset[in_slice]

            # The time slice filter above will convert dataset from a MemDataset
            # instance to either an MPIArray or np.ndarray (depending on whether
            # it is distributed).  Need to convert back to the appropriate
            # subclass of MemDataset for the initialization of the output dataset.
            if not isinstance(dataset, memh5.MemDataset):
                if distributed and isinstance(dataset, mpiarray.MPIArray):
                    dataset = memh5.MemDatasetDistributed.from_mpi_array(dataset)
                else:
                    dataset = memh5.MemDatasetCommon.from_numpy_array(dataset)

            # If this is the first piece of data, initialize the output
            # dataset.
            if name not in out:
                shape = dataset.shape
                dtype = dataset.dtype
                full_shape = shape[:axis_ind]
                full_shape += ((stop[axis] - start[axis]) * axis_rate,)
                full_shape += shape[axis_ind + 1:]
                if (distributed
                        and isinstance(dataset, memh5.MemDatasetDistributed)):
                    new_dset = out.create_dataset(
                        name,
                        shape=full_shape,
                        dtype=dtype,
                        distributed=True,
                        distributed_axis=dataset.distributed_axis,
                    )
                else:
                    new_dset = out.create_dataset(name, shape=full_shape, dtype=dtype)
                memh5.copyattrs(attrs, new_dset.attrs)

            out_dset = out[name]
            out_slice = (slice(None),) * axis_ind + (out_slice,)

            # Copy the data in.
            out_dtype = out_dset.dtype
            if (out_dtype.kind == 'V' and not out_dtype.fields
                    and out_dtype.shape and isinstance(out_dset, h5py.Dataset)):
                # Awkward special case for pure subarray dtypes, which h5py and
                # numpy treat differently.
                # Drop down to low level interface. I think this is only
                # necessary for pretty old h5py.
                from h5py import h5t
                from h5py._hl import selections
                mtype = h5t.py_create(out_dtype)
                mdata = dataset.copy().flat[:]
                mspace = selections.SimpleSelection(
                    (mdata.size // out_dtype.itemsize,)).id
                fspace = selections.select(out_dset.shape, out_slice, out_dset.id).id
                out_dset.id.write(mspace, fspace, mdata, mtype)
            else:
                out_dset[out_slice] = dataset[:]

        # Increment the start indexes for the next item of the list.
        for axis in current_concat_index_start.keys():
            current_concat_index_start[axis] += current_concat_index_n[axis]

    return out