def _g_open(self):
    """Open the node and cache its shape, byte order and row count.

    The shape and byte order returned by ``_open_unimplemented()`` are
    stored on the instance.  ``nrows`` is taken from the first dimension
    of the shape, or set to zero when the shape has no first dimension.

    Returns the object identifier given back by
    ``_open_unimplemented()``.
    """
    opened = self._open_unimplemented()
    self.shape, self.byteorder, object_id = opened

    try:
        row_count = SizeType(self.shape[0])
    except IndexError:
        # The shape is empty, so there is no first dimension to read.
        row_count = SizeType(0)
    self.nrows = row_count

    return object_id
def _calc_chunkshape(self, expectedrows):
    """Compute the one-dimensional chunk shape for the HDF5 dataset.

    `expectedrows` is turned into an expected size in MB using the size
    of a single base element (``self._basesize``).  That size is passed
    to ``calc_chunksize()`` and the result is expressed back in number
    of elements.

    Returns a 1-tuple holding the chunk length as a ``SizeType``.
    """
    # For HDF5 VL types the chunkshape has to be computed with the
    # itemsize of *each* element of the atom, not with the size of the
    # entire atom (hence ``self._basesize`` and not
    # ``self.atom.atomsize()``).  The reason is unclear.
    # F. Alted 2006-11-23
    item_bytes = self._basesize

    # AV 2013-05-03
    # Quick workaround that allows changing the API for the PyTables 3.0
    # release and removing the expected_mb parameter.  The algorithm for
    # computing the chunkshape should be rewritten as requested by gh-35.
    estimated_mb = expectedrows * item_bytes / 1024.**2
    chunk_bytes = calc_chunksize(estimated_mb)

    # Fall back to a single element per chunk when the itemsize is so
    # large that the floor division yields zero.
    items_per_chunk = chunk_bytes // item_bytes or 1
    return (SizeType(items_per_chunk),)
def __init__(self, parentnode, name):
    """Set up an `UnImplemented` instance attached to `parentnode`.

    The public attributes are given placeholder values here before the
    parent class initializer is run.
    """
    empty = SizeType(0)

    # UnImplemented objects always come from opening an existing node
    # (they can not be created), so this is never a brand new node.
    self._v_new = False

    # The length of the first dimension of the data.
    self.nrows = empty

    # The shape of the stored data.
    self.shape = (empty,)

    # The endianness of data in memory ('big', 'little' or 'irrelevant').
    self.byteorder = None

    super(UnImplemented, self).__init__(parentnode, name)
def _g_copyWithStats(self, group, name, start, stop, step,
                     title, filters, chunkshape, _log, **kwargs):
    """Private part of Leaf.copy() for each kind of leaf.

    A new `VLArray` is created under `group` and the rows selected by
    `start`, `stop` and `step` are appended to it one at a time.

    Returns a ``(new_vlarray, nbytes)`` tuple, where `nbytes` is the
    number of copied objects multiplied by the atom size.
    """
    # Build the destination VLArray.
    target = VLArray(
        group, name, self.atom, title=title, filters=filters,
        expectedsizeinMB=self._v_expectedsizeinMB,
        chunkshape=chunkshape, _log=_log)

    # The copy is not buffered because the length of each record cannot
    # be foreseen, so the safest approach is a row-by-row copy.  Some
    # analysis could be done in the future in order to buffer it.
    rows_per_read = 1
    start, stop, step = self._processRangeRead(start, stop, step)

    # Pseudo-atoms have no `size` of their own; use their base atom's.
    atom = self.atom
    if hasattr(atom, 'size'):
        atomsize = atom.size
    else:
        atomsize = atom.base.size

    # Optimized version (no conversions, no type and shape checks, etc.)
    stride = step * rows_per_read
    copied_rows = SizeType(0)
    copied_bytes = 0
    for first in lrange(start, stop, stride):
        # Never read past the requested end of the range.
        last = min(first + stride, stop)
        row = self._readArray(start=first, stop=last, step=step)[0]
        count = row.shape[0]
        # Save the record on disk.
        target._append(row, count)
        copied_bytes += count * atomsize
        copied_rows += 1

    target.nrows = copied_rows
    return (target, copied_bytes)
def _initLoop(self):
    """Reset the iteration state used by the ``__iter__`` iterator."""
    first_row = self._start

    # Both the read counter and the buffer start begin at the first row.
    self._nrowsread = first_row
    self._startb = first_row

    # Sentinels.
    self._row = -1
    self._init = True

    # The row number starts one step before the first row.
    self.nrow = SizeType(first_row - self._step)
def _calc_chunkshape(self, expectedrows, rowsize, itemsize): """Calculate the shape for the HDF5 chunk.""" # In case of a scalar shape, return the unit chunksize if self.shape == (): return (SizeType(1), ) # Compute the chunksize MB = 1024 * 1024 expectedsizeinMB = (expectedrows * rowsize) / MB chunksize = calc_chunksize(expectedsizeinMB) maindim = self.maindim # Compute the chunknitems chunknitems = chunksize // itemsize # Safeguard against itemsizes being extremely large if chunknitems == 0: chunknitems = 1 chunkshape = list(self.shape) # Check whether trimming the main dimension is enough chunkshape[maindim] = 1 newchunknitems = numpy.prod(chunkshape, dtype=SizeType) if newchunknitems <= chunknitems: chunkshape[maindim] = chunknitems // newchunknitems else: # No, so start trimming other dimensions as well for j in xrange(len(chunkshape)): # Check whether trimming this dimension is enough chunkshape[j] = 1 newchunknitems = numpy.prod(chunkshape, dtype=SizeType) if newchunknitems <= chunknitems: chunkshape[j] = chunknitems // newchunknitems break else: # Ops, we ran out of the loop without a break # Set the last dimension to chunknitems chunkshape[-1] = chunknitems return tuple(SizeType(s) for s in chunkshape)
def _g_create(self):
    """Save a new array in file.

    The object held in ``self._object`` is converted to the internal
    array representation, its byte order is fixed and the array is
    created on disk.  The node is closed before re-raising if either the
    conversion or the on-disk creation fails.

    Returns the object identifier of the array just created.

    Raises `TypeError` if the converted array has a void, unicode or
    object dtype.
    """
    self._v_version = obversion
    try:
        # `Leaf._g_postInitHook()` should be setting the flavor on disk.
        self._flavor = flavor = flavor_of(self._object)
        nparr = array_as_internal(self._object, flavor)
    except:  # XXX
        # Problems converting data.  Close the node and re-raise exception.
        # The bare ``except`` is deliberate: the exception is always
        # re-raised, this only guarantees the cleanup.
        self.close(flush=0)
        raise

    # Raise an error in case of unsupported object
    if nparr.dtype.kind in ('V', 'U', 'O'):  # in void, unicode, object
        raise TypeError(
            "Array objects cannot currently deal with void, "
            "unicode or object arrays")

    # Decrease the number of references to the object
    self._object = None

    # The shape of this array
    self.shape = tuple(SizeType(s) for s in nparr.shape)

    # Fix the byteorder of data
    nparr = self._g_fix_byteorder_data(nparr, nparr.dtype.byteorder)

    # Create the array on-disk
    try:
        # ``self._v_objectID`` needs to be set because would be
        # needed for setting attributes in some descendants later
        # on
        (self._v_objectID, self.atom) = self._createArray(
            nparr, self._v_new_title)
    except:  # XXX
        # Problems creating the Array on disk.  Close node and re-raise.
        self.close(flush=0)
        raise

    # Compute the optimal buffer size
    chunkshape = self._calc_chunkshape(
        self.nrows, self.rowsize, self.atom.itemsize)
    self.nrowsinbuf = self._calc_nrowsinbuf(
        chunkshape, self.rowsize, self.atom.itemsize)
    # Arrays don't have chunkshapes (so, set it to None)
    self._v_chunkshape = None

    return self._v_objectID
def _calc_chunkshape(self, expectedsizeinMB):
    """Compute the one-dimensional chunk shape for the HDF5 dataset.

    `expectedsizeinMB` is passed to ``calc_chunksize()`` and the result
    is divided by the size of a single base element
    (``self._basesize``).

    Returns a 1-tuple holding the chunk length as a ``SizeType``.
    """
    chunk_bytes = calc_chunksize(expectedsizeinMB)

    # For HDF5 VL types the chunkshape has to be computed with the
    # itemsize of *each* element of the atom, not with the size of the
    # entire atom (hence ``self._basesize`` and not
    # ``self.atom.atomsize()``).  The reason is unclear.
    # F. Alted 2006-11-23
    item_bytes = self._basesize

    # Use at least one element per chunk, even when the itemsize is so
    # large that the floor division yields zero.
    nitems = chunk_bytes // item_bytes
    if nitems == 0:
        return (SizeType(1),)
    return (SizeType(nitems),)
def _normalize_shape(shape):
    """Validate `shape` and return it as a tuple of ``SizeType`` values.

    An integer ``N`` is accepted as a shorthand for ``(N,)`` and must be
    at least 1.  Anything else must be convertible to a tuple.

    Raises `ValueError` for an integer lower than 1 or for a rank
    greater than 32, and `TypeError` when `shape` is neither an integer
    nor a sequence.
    """
    integer_types = (int, numpy.integer, long)
    if isinstance(shape, integer_types):
        if shape < 1:
            raise ValueError(
                "shape value must be greater than 0: %d" % shape)
        shape = (shape,)  # N is a shorthand for (N,)

    try:
        dims = tuple(shape)
    except TypeError:
        raise TypeError(
            "shape must be an integer or sequence: %r" % (shape,))

    # XXX Get from HDF5 library if possible.
    # HDF5 does not support ranks greater than 32
    max_rank = 32
    if len(dims) > max_rank:
        raise ValueError(
            "shapes with rank > 32 are not supported: %r" % (dims,))

    return tuple(map(SizeType, dims))
def _g_create(self):
    """Create a variable length array (ragged array).

    The atom is validated, the private ``_atomic*`` and ``_basesize``
    attributes are filled in from it, the chunkshape and byteorder are
    computed when they were not given, and the array is created on disk.

    Returns the object identifier of the array just created.

    Raises `ValueError` if any dimension of the atom shape is zero.
    """
    atom = self.atom
    self._v_version = obversion

    # Check for zero dims in atom shape (not allowed in VLArrays)
    zerodims = numpy.sum(numpy.array(atom.shape) == 0)
    if zerodims > 0:
        raise ValueError(
            "When creating VLArrays, none of the dimensions of "
            "the Atom instance can be zero.")

    # A pseudo-atom has no `size` attribute of its own; its dtype and
    # sizes are taken from its base atom instead.
    is_pseudoatom = not hasattr(atom, 'size')
    sized_atom = atom.base if is_pseudoatom else atom
    self._atomicdtype = sized_atom.dtype
    self._atomicsize = sized_atom.size
    self._basesize = sized_atom.itemsize
    self._atomictype = atom.type
    self._atomicshape = atom.shape

    # Compute the optimal chunkshape, if needed
    if self._v_chunkshape is None:
        self._v_chunkshape = self._calc_chunkshape(
            self._v_expectedsizeinMB)
    self.nrows = SizeType(0)  # No rows at creation time

    # Correct the byteorder if needed
    if self.byteorder is None:
        self.byteorder = correct_byteorder(atom.type, sys.byteorder)

    # After creating the vlarray, ``self._v_objectID`` needs to be
    # set because it is needed for setting attributes afterwards.
    self._v_objectID = self._createArray(self._v_new_title)

    # Add an attribute in case we have a pseudo-atom so that we
    # can retrieve the proper class after a re-opening operation.
    if is_pseudoatom:
        self.attrs.PSEUDOATOM = atom.kind

    return self._v_objectID
def __init__(self, parentNode, name,
             atom=None, title="",
             filters=None, expectedsizeinMB=1.0,
             chunkshape=None, byteorder=None,
             _log=True):
    """Set up a `VLArray` instance.

    The node is considered new when an `atom` is given.  For new nodes
    an explicit `chunkshape` may be an integer or a sequence of length
    one.

    Raises `TypeError` if `chunkshape` is neither an integer nor a
    sequence, and `ValueError` if its length is not 1.
    """
    is_new = atom is not None

    # The object version of this array.
    self._v_version = None
    # Is this the first time the node has been created?
    self._v_new = new = is_new
    # New title for this node.
    self._v_new_title = title
    # New filter properties for this array.
    self._v_new_filters = filters
    # The expected size of the array in MiB.
    self._v_expectedsizeinMB = expectedsizeinMB
    # Private storage for the `chunkshape` property of Leaf.
    self._v_chunkshape = None

    # Miscellaneous iteration rubbish, in order: starting row, stopping
    # row and step size for the current iteration, number of rows read
    # so far, starting and stopping rows for the current buffer, and the
    # current row in iterators (sentinel).
    for attrname in ('_start', '_stop', '_step', '_nrowsread',
                     '_startb', '_stopb', '_row'):
        setattr(self, attrname, None)
    # Whether we are in the middle of an iteration or not (sentinel).
    self._init = False
    # Current buffer in iterators.
    self.listarr = None

    # Documented (*public*) attributes.

    # An Atom (see :ref:`AtomClassDescr`) instance representing the
    # *type* and *shape* of the atomic objects to be saved.  You may use
    # a *pseudo-atom* for storing a serialized object or variable length
    # string per row.
    self.atom = atom
    # On iterators, this is the index of the current row.
    self.nrow = None
    # The current number of rows in the array.
    self.nrows = None
    # The index of the enlargeable dimension (always 0 for vlarrays,
    # which only have one dimension currently).
    self.extdim = 0

    # Check the chunkshape parameter
    if new and chunkshape is not None:
        if isinstance(chunkshape, (int, numpy.integer, long)):
            chunkshape = (chunkshape,)
        try:
            chunkshape = tuple(chunkshape)
        except TypeError:
            raise TypeError(
                "`chunkshape` parameter must be an integer or sequence "
                "and you passed a %s" % type(chunkshape))
        rank = len(chunkshape)
        if rank != 1:
            raise ValueError(
                "`chunkshape` rank (length) must be 1: %r" % (chunkshape,))
        self._v_chunkshape = tuple(map(SizeType, chunkshape))

    super(VLArray, self).__init__(
        parentNode, name, new, filters, byteorder, _log)
def __init__(self, parentnode, name,
             atom=None, shape=None,
             title="", filters=None,
             chunkshape=None, byteorder=None,
             _log=True):
    """Set up a `CArray` instance.

    The node is considered new when an `atom` is given.  For new nodes
    `atom` must be an `Atom` instance, `shape` must be a sequence and,
    when given, `chunkshape` must be a sequence with the same rank as
    `shape` and no entry lower than 1.

    Raises `ValueError` or `TypeError` when those checks fail.
    """
    # An `Atom` instance representing the shape, type of the atomic
    # objects to be saved.
    self.atom = atom
    # The shape of the stored array.
    self.shape = None
    # The index of the enlargeable dimension (`CArray` objects are not
    # enlargeable by default).
    self.extdim = -1

    # Other private attributes

    # The object version of this array.
    self._v_version = None
    # Is this the first time the node has been created?
    new = atom is not None
    self._v_new = new
    # New title for this node.
    self._v_new_title = title
    # Whether the ``Array`` object must be converted or not.
    self._v_convert = True
    # Private storage for the `chunkshape` property of the leaf.
    self._v_chunkshape = chunkshape

    # Miscellaneous iteration rubbish.

    # Starting row, stopping row and step size for the current iteration.
    self._start = self._stop = self._step = None
    # Number of rows read up to the current state of iteration.
    self._nrowsread = None
    # Starting and stopping rows for the current buffer.
    self._startb = self._stopb = None
    # Current row in iterators (sentinel).
    self._row = None
    # Whether we are in the middle of an iteration or not (sentinel).
    self._init = False
    # Current buffer in iterators.
    self.listarr = None

    if new:
        if not isinstance(atom, Atom):
            message = ("atom parameter should be an instance of "
                       "tables.Atom and you passed a %s.")
            raise ValueError(message % type(atom))
        if shape is None:
            raise ValueError("you must specify a non-empty shape")
        try:
            shape = tuple(shape)
        except TypeError:
            raise TypeError("`shape` parameter must be a sequence "
                            "and you passed a %s" % type(shape))
        self.shape = tuple(SizeType(dim) for dim in shape)

        if chunkshape is not None:
            try:
                chunkshape = tuple(chunkshape)
            except TypeError:
                raise TypeError(
                    "`chunkshape` parameter must be a sequence "
                    "and you passed a %s" % type(chunkshape))
            if len(shape) != len(chunkshape):
                raise ValueError("the shape (%s) and chunkshape (%s) "
                                 "ranks must be equal." %
                                 (shape, chunkshape))
            if min(chunkshape) < 1:
                raise ValueError("chunkshape parameter cannot have "
                                 "zero-dimensions.")
            self._v_chunkshape = tuple(SizeType(dim) for dim in chunkshape)

    # The `Array` class is not abstract enough! :(
    super(Array, self).__init__(
        parentnode, name, new, filters, byteorder, _log)
def _getnrows(self):
    """Return the length of the main dimension, or 1 for a scalar shape."""
    dims = self.shape
    if dims == ():
        # Scalar case.
        return SizeType(1)
    return dims[self.maindim]
def _g_open(self):
    """Open the node and cache its shape, byte order and row count.

    The shape and byte order returned by ``_openUnImplemented()`` are
    stored on the instance.  ``nrows`` is taken from the first dimension
    of the shape, or set to zero when the shape has no first dimension.

    Returns the object identifier given back by
    ``_openUnImplemented()``.
    """
    (self.shape, self.byteorder, objectID) = self._openUnImplemented()
    try:
        self.nrows = SizeType(self.shape[0])
    except IndexError:
        # An empty shape has no first dimension; report zero rows
        # instead of failing while opening the node.
        self.nrows = SizeType(0)
    return objectID
def __init__(self, parentNode, name,
             atom=None, shape=None,
             title="", filters=None,
             chunkshape=None, byteorder=None,
             _log=True):
    """Create a `CArray` instance.

    `atom`
        An `Atom` instance representing the *type* and *shape* of
        the atomic objects to be saved.

    `shape`
        The shape of the new array.

    `title`
        A description for this node (it sets the ``TITLE`` HDF5
        attribute on disk).

    `filters`
        An instance of the `Filters` class that provides
        information about the desired I/O filters to be applied
        during the life of this object.

    `chunkshape`
        The shape of the data chunk to be read or written in a
        single HDF5 I/O operation.  Filters are applied to those
        chunks of data.  The dimensionality of `chunkshape` must
        be the same as that of `shape`.  If ``None``, a sensible
        value is calculated (which is recommended).

    `byteorder`
        The byteorder of the data *on disk*, specified as 'little'
        or 'big'.  If this is not specified, the byteorder is that
        of the platform.

    Raises `ValueError` if, for a new node, `atom` is not an `Atom`
    instance, `shape` is ``None``, the ranks of `shape` and `chunkshape`
    differ, or `chunkshape` has an entry lower than 1.  Raises
    `TypeError` if `shape` or `chunkshape` is not a sequence.
    """
    # An `Atom` instance representing the shape, type of the atomic
    # objects to be saved.
    self.atom = atom
    # The shape of the stored array.
    self.shape = None
    # The index of the enlargeable dimension (`CArray` objects are not
    # enlargeable by default).
    self.extdim = -1

    # Other private attributes

    # The object version of this array.
    self._v_version = None
    # Is this the first time the node has been created?
    self._v_new = new = atom is not None
    # New title for this node.
    self._v_new_title = title
    # Whether the ``Array`` object must be converted or not.
    self._v_convert = True
    # Private storage for the `chunkshape` property of the leaf.
    self._v_chunkshape = chunkshape

    # Miscellaneous iteration rubbish.

    # Starting row for the current iteration.
    self._start = None
    # Stopping row for the current iteration.
    self._stop = None
    # Step size for the current iteration.
    self._step = None
    # Number of rows read up to the current state of iteration.
    self._nrowsread = None
    # Starting row for current buffer.
    self._startb = None
    # Stopping row for current buffer.
    self._stopb = None
    # Current row in iterators (sentinel).
    self._row = None
    # Whether we are in the middle of an iteration or not (sentinel).
    self._init = False
    # Current buffer in iterators.
    self.listarr = None

    if new:
        if not isinstance(atom, Atom):
            raise ValueError("atom parameter should be an instance of "
                             "tables.Atom and you passed a %s."
                             % type(atom))
        if shape is None:
            raise ValueError("you must specify a non-empty shape")
        try:
            shape = tuple(shape)
        except TypeError:
            raise TypeError("`shape` parameter must be a sequence "
                            "and you passed a %s" % type(shape))
        self.shape = tuple(SizeType(s) for s in shape)

        if chunkshape is not None:
            try:
                chunkshape = tuple(chunkshape)
            except TypeError:
                raise TypeError(
                    "`chunkshape` parameter must be a sequence "
                    "and you passed a %s" % type(chunkshape))
            if len(shape) != len(chunkshape):
                raise ValueError("the shape (%s) and chunkshape (%s) "
                                 "ranks must be equal."
                                 % (shape, chunkshape))
            elif min(chunkshape) < 1:
                # The message used to start with a stray blank.
                raise ValueError("chunkshape parameter cannot have "
                                 "zero-dimensions.")
            self._v_chunkshape = tuple(SizeType(s) for s in chunkshape)

    # The `Array` class is not abstract enough! :(
    super(Array, self).__init__(parentNode, name, new, filters,
                                byteorder, _log)
def __init__(self, parentNode, name,
             atom=None, title="",
             filters=None, expectedsizeinMB=1.0,
             chunkshape=None, byteorder=None,
             _log=True):
    """Create a `VLArray` instance.

    `atom`
        An `Atom` instance representing the *type* and *shape* of
        the atomic objects to be saved.

    `title`
        A description for this node (it sets the ``TITLE`` HDF5
        attribute on disk).

    `filters`
        An instance of the `Filters` class that provides
        information about the desired I/O filters to be applied
        during the life of this object.

    `expectedsizeinMB`
        A user estimate of the size (in MB) of the final `VLArray`
        object.  If not provided, the default value is 1 MB.  If you
        plan to create either a much smaller or a much bigger
        `VLArray`, try providing a guess; this will optimize the HDF5
        B-Tree creation and management process time and the amount
        of memory used.

    `chunkshape`
        The shape of the data chunk to be read or written in a
        single HDF5 I/O operation.  Filters are applied to those
        chunks of data.  The dimensionality of `chunkshape` must
        be 1.  If ``None``, a sensible value is calculated (which
        is recommended).

    `byteorder`
        The byteorder of the data *on disk*, specified as 'little'
        or 'big'.  If this is not specified, the byteorder is that
        of the platform.
    """
    # The object version of this array.
    self._v_version = None
    # Is this the first time the node has been created?
    new = atom is not None
    self._v_new = new
    # New title for this node.
    self._v_new_title = title
    # New filter properties for this array.
    self._v_new_filters = filters
    # The expected size of the array in MiB.
    self._v_expectedsizeinMB = expectedsizeinMB
    # Private storage for the `chunkshape` property of Leaf.
    self._v_chunkshape = None

    # Miscellaneous iteration rubbish.

    # Starting row, stopping row and step size for the current iteration.
    self._start = self._stop = self._step = None
    # Number of rows read up to the current state of iteration.
    self._nrowsread = None
    # Starting and stopping rows for the current buffer.
    self._startb = self._stopb = None
    # Current row in iterators (sentinel).
    self._row = None
    # Whether we are in the middle of an iteration or not (sentinel).
    self._init = False
    # Current buffer in iterators.
    self.listarr = None

    # Documented (*public*) attributes.

    # An `Atom` instance representing the shape and type of the atomic
    # objects to be saved.
    self.atom = atom
    # On iterators, this is the index of the current row.
    self.nrow = None
    # The total number of rows.
    self.nrows = None
    # The index of the enlargeable dimension (always 0 for vlarrays,
    # which only have one dimension currently).
    self.extdim = 0

    # Check the chunkshape parameter
    must_check = new and chunkshape is not None
    if must_check:
        if isinstance(chunkshape, (int, numpy.integer, long)):
            chunkshape = (chunkshape,)
        try:
            chunkshape = tuple(chunkshape)
        except TypeError:
            raise TypeError(
                "`chunkshape` parameter must be an integer or sequence "
                "and you passed a %s" % type(chunkshape))
        if len(chunkshape) != 1:
            raise ValueError(
                "`chunkshape` rank (length) must be 1: %r" % (chunkshape,))
        self._v_chunkshape = tuple(SizeType(dim) for dim in chunkshape)

    super(VLArray, self).__init__(
        parentNode, name, new, filters, byteorder, _log)