Example 1
    def _create_vlarray(self, group: tables.Group, name: str,
                        data: VLArray) -> None:
        assert len(data), "VLArray must have at least one element"

        # A dict is flattened to a list ordered by key; the flag is stored as
        # a node attribute so the structure can be recovered on read.
        was_dict = isinstance(data, dict)
        if was_dict:
            data = [data[i] for i in sorted(data.keys())]

        # All elements must share a single type so one atom fits them all.
        types = set(type(v) for v in data)
        assert len(types) == 1, \
            f"More than one type found in VLArray {name}: {types}"

        if str in types:
            atom = tables.VLUnicodeAtom()
        elif bytes in types:
            atom = tables.VLStringAtom()
        else:
            # Numeric data: coerce every element to an ndarray and require a
            # single common dtype.
            data = [
                v if isinstance(v, np.ndarray) else np.asarray(v) for v in data
            ]
            dtypes = set(v.dtype for v in data)
            assert len(dtypes) == 1, \
                f"More than one dtype found in VLArray {name}: {dtypes}"
            atom = tables.Atom.from_dtype(dtypes.pop())

        _d: tables.VLArray = self.h5f.create_vlarray(group,
                                                     name,
                                                     atom,
                                                     filters=self.filters)
        for v in data:
            _d.append(v)
        _d.set_attr("was_dict", was_dict)
        self.h5f.flush()
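
The stored `was_dict` attribute makes the write reversible, provided the original dict was keyed 0..n-1 (the only keys the sorted flattening above preserves). A minimal sketch of the matching read path; the `_read_vlarray` name is an assumption, not part of the original class:

    def _read_vlarray(self, group: tables.Group, name: str):
        # Hypothetical inverse of _create_vlarray: iterate the rows back out
        # and, if the data originally came from a dict, rebuild it with the
        # integer keys 0..n-1 implied by the writer's sorted() pass.
        node = self.h5f.get_node(group, name)
        rows = [row for row in node]
        if node.get_attr("was_dict"):
            return {i: v for i, v in enumerate(rows)}
        return rows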
Example 2
import tables

# Create a new HDF5 file to hold the variable-length arrays
fileh = tables.open_file("vlarray2.h5", mode="w")
root = fileh.root

# Boolean atoms
vlarray = fileh.create_vlarray(root, 'vlarray6', tables.BoolAtom(),
                               "Boolean atoms")
# Integer sequences are coerced to booleans on append...
vlarray.append([1, 0])
vlarray.append([1, 0, 3, 0])  # Nonzero values are stored as True

# Variable length strings (VLStringAtom stores raw bytes, so append bytes)
vlarray = fileh.create_vlarray(root, 'vlarray7', tables.VLStringAtom(),
                               "Variable Length String")
vlarray.append(b"asd")
vlarray.append(b"aaana")

# Unicode variable length strings
vlarray = fileh.create_vlarray(root, 'vlarray8', tables.VLUnicodeAtom(),
                               "Variable Length Unicode String")
vlarray.append("aaana")
vlarray.append("")   # The empty string
vlarray.append("asd")
vlarray.append("para\u0140lel")

# Close the file
fileh.close()

# Open the file for reading
fileh = tables.open_file("vlarray2.h5", mode="r")
# Get the root group
root = fileh.root

# Print the name and contents of every leaf node
for node in fileh.list_nodes(root, "Leaf"):
    print(node.name, "-->", node.read())

fileh.close()
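
Individual rows can also be read back by index, without scanning the whole node; a minimal sketch:

# Read single rows by index (each row is one variable-length value)
fileh = tables.open_file("vlarray2.h5", mode="r")
print(fileh.root.vlarray6[0])  # -> array([ True, False])
print(fileh.root.vlarray8[3])  # -> 'para\u0140lel'
fileh.close()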
Example 3
import tables as tb


def write_h5f_strs(h5f, h5fplace, name, arr):
    # Store a sequence of Python strings as a variable-length Unicode array.
    # (createVLArray is the deprecated PyTables 2.x spelling of create_vlarray.)
    s_place = h5f.create_vlarray(h5fplace, name, tb.VLUnicodeAtom())
    for s in arr:
        s_place.append(s)
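
A quick usage sketch; the file and node names here are illustrative only:

with tb.open_file("strings.h5", mode="w") as h5f:
    write_h5f_strs(h5f, h5f.root, "names", ["alpha", "beta", "gamma"])
    print(h5f.root.names[:])  # -> ['alpha', 'beta', 'gamma']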
Example 4
    def import_idca(self,
                    filepath_or_buffer,
                    caseid_col,
                    altid_col,
                    choice_col=None,
                    force_int_as_float=True,
                    chunksize=1e300):
        """Import an existing CSV or similar file in idca format into this HDF5 file.
		
		This function relies on :func:`pandas.read_csv` to read and parse the input data.
		All arguments other than those described below are passed through to that function.
		
		Parameters
		----------
		filepath_or_buffer : str or buffer
			This argument will be fed directly to the :func:`pandas.read_csv` function.
		caseid_column : None or str
			If given, this is the column of the input data file to use as caseids.  It must be 
			given if the caseids do not already exist in the HDF5 file.  If it is given and
			the caseids do already exist, a `LarchError` is raised.
		altid_col : None or str
			If given, this is the column of the input data file to use as altids.  It must be
			given if the altids do not already exist in the HDF5 file.  If it is given and
			the altids do already exist, a `LarchError` is raised.
		choice_col : None or str
			If given, use this column as the choice indicator.
		force_int_as_float : bool
			If True, data columns that appear to be integer values will still be stored as 
			double precision floats (defaults to True).
		chunksize : int
			The number of rows of the source file to read as a chunk.  Reading a giant file in moderate sized
			chunks can be much faster and less memory intensive than reading the entire file.
		
		Returns
		-------
		DT
			self
		
		Raises
		------
		LarchError
			Various errors.
			
		Notes
		-----
		Chunking may not work on Mac OS X due to a `known bug <http://github.com/pydata/pandas/issues/11793>`_
		in the pandas.read_csv function.
		"""
        import pandas

        # First pass: scan only the two id columns to collect the distinct
        # caseids and altids across the whole file.
        casealtreader = pandas.read_csv(filepath_or_buffer,
                                        chunksize=chunksize,
                                        usecols=[caseid_col, altid_col])
        caseids = numpy.array([], dtype='int64')
        altids = numpy.array([], dtype='int64')
        for chunk in casealtreader:
            caseids = numpy.union1d(caseids, chunk[caseid_col].values)
            altids = numpy.union1d(altids, chunk[altid_col].values)

        # Non-integer case labels are converted to unique integer codes.
        if caseids.dtype != numpy.int64:
            from ..util.arraytools import labels_to_unique_ids
            case_labels, caseids = labels_to_unique_ids(caseids)
            caseids = caseids.astype('int64')

        # Store the caseids, or verify they match any already in the file.
        if 'caseids' not in self.h5top:
            self.h5f.create_carray(self.h5top, 'caseids', obj=caseids)
        else:
            if not numpy.all(caseids == self.h5caseids[:]):
                raise LarchError(
                    'caseids exist but do not match the imported data')

        alt_labels = None
        if 'altids' not in self.alts:
            if altids.dtype == numpy.int32:
                altids = altids.astype(numpy.int64)
            if altids.dtype != numpy.int64:
                from ..util.arraytools import labels_to_unique_ids
                alt_labels, altids = labels_to_unique_ids(altids)
            h5altids = self.h5f.create_carray(
                self.alts._v_node,
                'altids',
                obj=altids,
                title='elemental alternative code numbers')
        else:
            if not numpy.all(numpy.in1d(altids, self.alts.altids[:], True)):
                raise LarchError(
                    'altids exist but do not match the imported data')
            else:
                altids = self.alts.altids[:]
        if 'names' not in self.alts:
            h5altnames = self.h5f.create_vlarray(
                self.alts._v_node,
                'names',
                _tb.VLUnicodeAtom(),
                title='elemental alternative names')
            if alt_labels is not None:
                # Use the original (non-integer) labels as the names.
                for an in alt_labels:
                    h5altnames.append(str(an))
            else:
                # Fall back to synthetic names like 'a1', 'a2', ...
                for an in self.alts.altids[:]:
                    h5altnames.append('a' + str(an))

        caseidmap = {i: n for n, i in enumerate(caseids)}
        altidmap = {i: n for n, i in enumerate(altids)}
        if alt_labels is not None:
            # if the altids are not integers, we replace the altid map with a labels map
            altidmap = {i: n for n, i in enumerate(alt_labels)}

        # Rewind the input if it is a buffer; plain file paths have no seek().
        try:
            filepath_or_buffer.seek(0)
        except AttributeError:
            pass

        # Peek at the first 1000 rows to choose a storage dtype per column.
        colreader = pandas.read_csv(filepath_or_buffer, nrows=1000)
        force_float_columns = {}
        h5arr = {}
        for col in colreader.columns:
            if col in (caseid_col, altid_col): continue
            if force_int_as_float and colreader[col].dtype == numpy.int64:
                atom_dtype = _tb.atom.Float64Atom()
                force_float_columns[col] = numpy.float64
            else:
                atom_dtype = _tb.Atom.from_dtype(colreader[col].dtype)
            h5arr[col] = self.h5f.create_carray(self.idca._v_node,
                                                col,
                                                atom_dtype,
                                                shape=(caseids.shape[0],
                                                       altids.shape[0]))
        if '_present_' not in colreader.columns:
            h5arr['_present_'] = self.h5f.create_carray(
                self.idca._v_node,
                '_present_',
                _tb.atom.BoolAtom(),
                shape=(caseids.shape[0], altids.shape[0]))

        try:
            filepath_or_buffer.seek(0)
        except AttributeError:
            pass

        # Second pass: read the full file in chunks and scatter each column's
        # values into its (case, alt) positions.
        reader = pandas.read_csv(filepath_or_buffer,
                                 chunksize=chunksize,
                                 dtype=force_float_columns,
                                 engine='c')
        try:
            for chunk in reader:
                casemap = chunk[caseid_col].map(caseidmap)
                altmap = chunk[altid_col].map(altidmap)
                for col in chunk.columns:
                    if col in (caseid_col, altid_col): continue
                    h5arr[col][casemap.values,
                               altmap.values] = chunk[col].values
                if '_present_' not in chunk.columns:
                    h5arr['_present_'][casemap.values, altmap.values] = True
        except:
            # Stash the failing chunk and maps on the object to aid
            # debugging, then re-raise.
            self._chunk = chunk
            self._casemap = casemap
            self._altmap = altmap
            self._altidmap = altidmap
            raise

        # Expose '_avail_' as an alias (soft link) for '_present_'.
        self.h5f.create_soft_link(self.idca._v_node,
                                  '_avail_',
                                  target=self.idca._v_node._v_pathname +
                                  '/_present_')

        if choice_col:
            if isinstance(self.idca._v_children[choice_col].atom,
                          _tb.atom.Float64Atom):
                # Already float64: just link to the column.
                self.h5f.create_soft_link(self.idca._v_node,
                                          '_choice_',
                                          target=self.idca._v_pathname + '/' +
                                          choice_col)
            else:
                # Copy into a float64 array (note numpy spells the dtype
                # 'float64'; 'Float64' is not a valid dtype string).
                self.h5f.create_carray(
                    self.idca._v_node,
                    '_choice_',
                    obj=self.idca._v_children[choice_col][:].astype('float64'))

        return self
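
A hedged usage sketch: the `DT` constructor arguments, file name, and column names below are assumptions for illustration, not taken from the original.

# Hypothetical usage of import_idca; all names are illustrative only.
import larch

dt = larch.DT("example.h5", 'w')
dt.import_idca("mydata.csv",
               caseid_col="case_id",
               altid_col="alt_id",
               choice_col="chosen")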
Example 5
    def FromDB(cls, db, filename=None, temp=True):
        '''Generate a DT data file from a DB file.

        Larch comes with a few example data sets, which are used in documentation
        and testing. This function copies the data into an HDF5 file, which you can
        freely edit without damaging the original data.

        Parameters
        ----------
        db : DB
            The source database to copy.
        filename : str
            A filename to open the HDF5 file (even in-memory files need a name).
        temp : bool
            If True (the default), the HDF5 file is created in-memory; if False,
            the file will be dumped to disk when closed.

        Returns
        -------
        DT
            An open connection to the HDF5 example data.

        '''

        h5filters = _tb.Filters(complevel=1)

        if filename is None:
            filename = '{}.h5'.format(
                os.path.splitext(os.path.basename(db.source_filename))[0])

        from ..util.filemanager import next_stack
        # Try successively "stacked" filenames until one can be opened.
        n = 0
        while True:
            try:
                tryname = next_stack(filename, plus=n, allow_natural=(n == 0))
                h5f = _tb.open_file(tryname,
                                    'w',
                                    filters=h5filters,
                                    driver="H5FD_CORE",
                                    driver_core_backing_store=0 if temp else 1)
            except ValueError:
                n += 1
                if n > 1000:
                    raise RuntimeError(
                        "cannot open HDF5 at {}".format(filename))
            else:
                break

        from ..db import DB
        if not isinstance(db, DB):
            raise TypeError('db must be DB')

        edb = db
        self = cls(filename, 'w', h5f=h5f)

        descrip_larch = {}
        descrip_alts = {
            'altid': _tb.Int64Col(pos=1, dflt=-999),
            'name': _tb.StringCol(itemsize=127, pos=2, dflt=""),
        }
        descrip_co = {}
        descrip_ca = {}
        vars_co = edb.variables_co()
        vars_ca = edb.variables_ca()
        for i in vars_co:
            if i == 'caseid':
                descrip_co[i] = _tb.Int64Col(pos=len(descrip_co), dflt=-999)
            else:
                descrip_co[i] = _tb.Float64Col(pos=len(descrip_co),
                                               dflt=numpy.nan)
        for i in vars_ca:
            if i in ('caseid', 'altid'):
                descrip_ca[i] = _tb.Int64Col(pos=len(descrip_ca), dflt=-999)
            else:
                descrip_ca[i] = _tb.Float64Col(pos=len(descrip_ca),
                                               dflt=numpy.nan)

        # Create the group hierarchy.
        larchnode = h5f._get_or_create_path("/larch", True)
        larchidca = h5f._get_or_create_path("/larch/idca", True)
        larchidco = h5f._get_or_create_path("/larch/idco", True)
        larchalts = h5f._get_or_create_path("/larch/alts", True)

        # Copy each idca variable into a (cases x alts) float array.
        for var_ca in vars_ca:
            if var_ca not in ('caseid', 'casenum', 'IDCASE'):
                h5var = h5f.create_carray(larchidca,
                                          var_ca,
                                          _tb.Float64Atom(),
                                          shape=(edb.nCases(), edb.nAlts()),
                                          filters=h5filters)
                arr, caseids = edb.array_idca(var_ca)
                h5var[:, :] = arr.squeeze()

        # Copy each idco variable into a one-dimensional (cases,) array.
        for var_co in vars_co:
            if var_co not in ('caseid', 'casenum', 'IDCASE'):
                h5var = h5f.create_carray(larchidco,
                                          var_co,
                                          _tb.Float64Atom(),
                                          shape=(edb.nCases(), ),
                                          filters=h5filters)
                arr, caseids = edb.array_idco(var_co)
                h5var[:] = arr.squeeze()

        h5caseids = h5f.create_carray(larchnode,
                                      'caseids',
                                      _tb.Int64Atom(),
                                      shape=(edb.nCases(), ),
                                      filters=h5filters)
        h5caseids[:] = caseids.squeeze()

        h5scrn = h5f.create_carray(larchnode,
                                   'screen',
                                   _tb.BoolAtom(),
                                   shape=(edb.nCases(), ),
                                   filters=h5filters)
        h5scrn[:] = True

        h5altids = h5f.create_carray(
            larchalts,
            'altids',
            _tb.Int64Atom(),
            shape=(edb.nAlts(), ),
            filters=h5filters,
            title='elemental alternative code numbers')
        h5altids[:] = edb.alternative_codes()

        h5altnames = h5f.create_vlarray(larchalts,
                                        'names',
                                        _tb.VLUnicodeAtom(),
                                        filters=h5filters,
                                        title='elemental alternative names')
        for an in edb.alternative_names():
            h5altnames.append(str(an))

        if isinstance(edb.queries.avail, (dict, IntStringDict)):
            self.avail_idco = dict(edb.queries.avail)
        else:
            h5avail = h5f.create_carray(larchidca,
                                        '_avail_',
                                        _tb.BoolAtom(),
                                        shape=(edb.nCases(), edb.nAlts()),
                                        filters=h5filters)
            arr, caseids = edb.array_avail()
            h5avail[:, :] = arr.squeeze()

        # If the choice is a single idca column, expose it as a soft link;
        # otherwise materialize the computed choice array.
        try:
            ch_ca = edb.queries.get_choice_ca()
            h5f.create_soft_link(larchidca,
                                 '_choice_',
                                 target='/larch/idca/' + ch_ca)
        except AttributeError:
            h5ch = h5f.create_carray(larchidca,
                                     '_choice_',
                                     _tb.Float64Atom(),
                                     shape=(edb.nCases(), edb.nAlts()),
                                     filters=h5filters)
            arr, caseids = edb.array_choice()
            h5ch[:, :] = arr.squeeze()

        wgt = edb.queries.weight
        if wgt:
            h5f.create_soft_link(larchidco,
                                 '_weight_',
                                 target='/larch/idco/' + wgt)

        return self
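
A brief usage sketch; the `DB.Example` helper and the dataset name are assumptions for illustration:

# Hypothetical usage: copy an example DB into an editable in-memory HDF5 file.
import larch

db = larch.DB.Example('MTC')  # assumed example-loading helper
dt = larch.DT.FromDB(db)      # in-memory copy (temp=True by default)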