コード例 #1
0
ファイル: dataset.py プロジェクト: codefour-gmbh/vaex
 def write_meta(self):
     """Write column metadata as attributes in the HDF5 file.

     UCDs, descriptions and units are stored as attributes on the HDF5
     objects of ``self.filename`` instead of in a separate file, as the
     default :func:`Dataset.write_meta` does.

     For every column, each of the ``ucd``, ``unit`` and ``description``
     attributes is either (re)written from the in-memory value or, when no
     in-memory value exists, removed from the HDF5 object if it is present.

     Raises:
         ValueError: if a column of this dataset is found neither directly
             under the columns group nor inside a ``csr_matrix`` group.
     """
     with h5py.File(self.filename, "r+") as h5file_output:
         h5table_root = h5file_output[self.h5table_root_name]
         if self.description is not None:
             h5table_root.attrs["description"] = self.description
         # version 1 files keep the columns directly under the table root
         h5columns = h5table_root if self._version == 1 else h5table_root['columns']
         for column_name in self.columns.keys():
             h5dataset = None
             if column_name in h5columns:
                 h5dataset = h5columns[column_name]
             else:
                 # the column may be stored as a member of a csr_matrix group
                 for group in h5columns.values():
                     if 'type' in group.attrs:
                         if group.attrs['type'] in ['csr_matrix']:
                             for name, column in group.items():
                                 if name == column_name:
                                     h5dataset = column
             if h5dataset is None:
                 raise ValueError('column {} not found'.format(column_name))
             for name, values in [("ucd", self.ucds), ("unit", self.units), ("description", self.descriptions)]:
                 if column_name in values:
                     value = ensure_string(values[column_name], cast=True)
                     h5dataset.attrs[name] = value
                 elif name in h5dataset.attrs:
                     # Drop a stale attribute. The membership test must be
                     # done on the same object we delete from; testing the
                     # parent group either skips the cleanup or raises
                     # KeyError on the delete.
                     del h5dataset.attrs[name]
コード例 #2
0
ファイル: dataset.py プロジェクト: yunstanford/vaex
 def write_meta(self):
     """Write column metadata as attributes in the HDF5 file.

     UCDs, descriptions and units are stored as attributes on the HDF5
     objects of ``self.filename`` instead of in a separate file, as the
     default :func:`Dataset.write_meta` does.

     For every column, each of the ``ucd``, ``unit`` and ``description``
     attributes is either (re)written from the in-memory value or, when no
     in-memory value exists, removed from the HDF5 object if it is present.
     """
     with h5py.File(self.filename, "r+") as h5file_output:
         h5table_root = h5file_output[self.h5table_root_name]
         if self.description is not None:
             h5table_root.attrs["description"] = self.description
         # version 1 files keep the columns directly under the table root
         h5columns = h5table_root if self._version == 1 else h5table_root[
             'columns']
         for column_name in self.columns.keys():
             h5dataset = h5columns[column_name]
             for name, values in [("ucd", self.ucds), ("unit", self.units),
                                  ("description", self.descriptions)]:
                 if column_name in values:
                     value = ensure_string(values[column_name], cast=True)
                     h5dataset.attrs[name] = value
                 elif name in h5dataset.attrs:
                     # Drop a stale attribute. The membership test must be
                     # done on the same object we delete from; testing the
                     # parent group either skips the cleanup or raises
                     # KeyError on the delete.
                     del h5dataset.attrs[name]
コード例 #3
0
    def _load_columns(self, h5data, first=[]):
        """Load columns and their metadata from an open HDF5 group.

        Reads the dataset description, then walks every member of the columns
        group. Members with a ``type`` attribute of ``csr_matrix`` are mapped
        as one sparse matrix providing several columns; members without a
        ``type`` attribute are treated as regular columns (numeric, masked or
        string). Members with any other ``type`` value are ignored. Finally
        ``self._columns`` is rebuilt so that columns named in the
        ``column_order`` attribute come first, in that order.

        Args:
            h5data: the HDF5 group holding the table; for file versions other
                than 1 the columns live in its ``'columns'`` subgroup.
            first: unused by this implementation.
        """
        if "description" in h5data.attrs:
            self.description = ensure_string(h5data.attrs["description"])
        # hdf5/h5py doesn't keep the order of columns, so it is tracked
        # manually in an attribute; this also enables reordering later
        h5columns = h5data if self._version == 1 else h5data['columns']
        if "column_order" in h5columns.attrs:
            column_order = ensure_string(
                h5columns.attrs["column_order"]).split(",")
        else:
            column_order = []
        for group_name in list(h5columns):
            logger.debug('loading column: %s', group_name)
            group = h5columns[group_name]
            if 'type' in group.attrs:
                if group.attrs['type'] in ['csr_matrix']:
                    from scipy.sparse import csr_matrix

                    # subclass whose format check is a no-op, so constructing
                    # the matrix does not run scipy's validation
                    class csr_matrix_nocheck(csr_matrix):
                        def check_format(self, *args, **kwargs):
                            pass

                    data = self._map_hdf5_array(group['data'])
                    indptr = self._map_hdf5_array(group['indptr'])
                    indices = self._map_hdf5_array(group['indices'])
                    # make sure we keep the original order: every sub-group
                    # stores its own position in 'column_index'
                    groups = [(name, value) for name, value in group.items()
                              if isinstance(value, h5py.Group)]
                    column_names = [None] * len(groups)
                    for name, column in groups:
                        column_names[column.attrs['column_index']] = name
                    matrix = csr_matrix_nocheck(
                        (data, indices, indptr),
                        shape=(len(indptr) - 1, len(column_names)))
                    # the mapped arrays must be used as-is, not copied
                    assert matrix.data is data
                    assert matrix.indices is indices
                    self.add_columns(column_names, matrix)
            else:
                column_name = group_name
                column = h5columns[column_name]
                if "alias" in column.attrs:
                    column_name = column.attrs["alias"]
                if "ucd" in column.attrs:
                    self.ucds[column_name] = ensure_string(column.attrs["ucd"])
                if "description" in column.attrs:
                    self.descriptions[column_name] = ensure_string(
                        column.attrs["description"])
                if "unit" in column.attrs:
                    # best effort: an unparsable unit is logged, not fatal
                    try:
                        unitname = ensure_string(column.attrs["unit"])
                        if unitname and unitname != "None":
                            self.units[column_name] = _try_unit(unitname)
                    except Exception:
                        logger.exception("error parsing unit: %s",
                                         column.attrs["unit"])
                if "units" in column.attrs:  # Amuse case
                    unitname = ensure_string(column.attrs["units"])
                    logger.debug("amuse unit: %s", unitname)
                    if unitname == "(0.01 * system.get('S.I.').base('length'))":
                        self.units[column_name] = astropy.units.Unit("cm")
                    if unitname == "((0.01 * system.get('S.I.').base('length')) * (system.get('S.I.').base('time')**-1))":
                        self.units[column_name] = astropy.units.Unit("cm/s")
                    if unitname == "(0.001 * system.get('S.I.').base('mass'))":
                        self.units[column_name] = astropy.units.Unit("gram")

                    if unitname == "system.get('S.I.').base('length')":
                        self.units[column_name] = astropy.units.Unit("m")
                    if unitname == "(system.get('S.I.').base('length') * (system.get('S.I.').base('time')**-1))":
                        self.units[column_name] = astropy.units.Unit("m/s")
                    if unitname == "system.get('S.I.').base('mass')":
                        self.units[column_name] = astropy.units.Unit("kg")
                data = column if self._version == 1 else column['data']
                if hasattr(data, "dtype"):
                    if "dtype" in data.attrs and data.attrs["dtype"] == "str":
                        # string column: byte buffer + offsets (+ null bitmap)
                        indices = self._map_hdf5_array(column['indices'])
                        string_bytes = self._map_hdf5_array(data)
                        if "null_bitmap" in column:
                            null_bitmap = self._map_hdf5_array(
                                column['null_bitmap'])
                        else:
                            null_bitmap = None
                        if isinstance(
                                indices,
                                np.ndarray):  # this is a real mmappable file
                            self.add_column(
                                column_name,
                                vaex.arrow.convert.
                                arrow_string_array_from_buffers(
                                    string_bytes, indices, null_bitmap))
                        else:
                            # if not a real mmappable array, we fall back to
                            # this, maybe we can generalize this
                            self.add_column(
                                column_name,
                                ColumnStringArrow(indices,
                                                  string_bytes,
                                                  null_bitmap=null_bitmap))
                    else:
                        dtype = data.dtype
                        if "dtype" in data.attrs:
                            dtype = data.attrs["dtype"]
                        logger.debug("adding column %r with dtype %r",
                                     column_name, dtype)
                        if self._version > 1 and 'mask' in column:
                            self.add_column(
                                column_name,
                                self._map_hdf5_array(data, column['mask']))
                        else:
                            self.add_column(column_name,
                                            self._map_hdf5_array(data))
        # rebuild the column mapping: explicitly ordered columns first ...
        all_columns = dict(**self._columns)
        self._columns = {}
        for name in column_order:
            # skip names that refer to non-existing (or repeated) columns
            if name in all_columns:
                self._columns[name] = all_columns.pop(name)
        # ... then the rest
        for name, col in all_columns.items():
            self._columns[name] = col
コード例 #4
0
ファイル: dataset.py プロジェクト: yunstanford/vaex
    def _load_columns(self, h5data, first=[]):
        """Load columns and their metadata from an open HDF5 group.

        Reads the dataset description, determines the column order (the
        ``column_order`` attribute followed by any remaining members) and, for
        every member that has a ``dtype``, registers it via ``addColumn``
        using the offset of its data in the file. For file versions above 1 a
        ``'mask'`` member turns the column into a numpy masked array.

        Args:
            h5data: the HDF5 group holding the table; for file versions other
                than 1 the columns live in its ``'columns'`` subgroup.
            first: unused by this implementation.

        Raises:
            Exception: if a column's data has no offset in the file
                (``get_offset()`` returned ``None``).
        """
        finished = set()
        if "description" in h5data.attrs:
            self.description = ensure_string(h5data.attrs["description"])
        # hdf5/h5py doesn't keep the order of columns, so it is tracked
        # manually in an attribute; this also enables reordering later
        h5columns = h5data if self._version == 1 else h5data['columns']
        if "column_order" in h5columns.attrs:
            column_order = ensure_string(
                h5columns.attrs["column_order"]).split(",")
        else:
            column_order = []
        # members not mentioned in column_order are loaded after the rest
        for name in list(h5columns):
            if name not in column_order:
                column_order.append(name)
        for column_name in column_order:
            if column_name in h5columns and column_name not in finished:
                column = h5columns[column_name]
                if "ucd" in column.attrs:
                    self.ucds[column_name] = ensure_string(column.attrs["ucd"])
                if "description" in column.attrs:
                    self.descriptions[column_name] = ensure_string(
                        column.attrs["description"])
                if "unit" in column.attrs:
                    # best effort: an unparsable unit is logged, not fatal
                    try:
                        unitname = ensure_string(column.attrs["unit"])
                        if unitname and unitname != "None":
                            self.units[column_name] = _try_unit(unitname)
                    except Exception:
                        logger.exception("error parsing unit: %s",
                                         column.attrs["unit"])
                if "units" in column.attrs:  # Amuse case
                    unitname = ensure_string(column.attrs["units"])
                    logger.debug("amuse unit: %s", unitname)
                    if unitname == "(0.01 * system.get('S.I.').base('length'))":
                        self.units[column_name] = astropy.units.Unit("cm")
                    if unitname == "((0.01 * system.get('S.I.').base('length')) * (system.get('S.I.').base('time')**-1))":
                        self.units[column_name] = astropy.units.Unit("cm/s")
                    if unitname == "(0.001 * system.get('S.I.').base('mass'))":
                        self.units[column_name] = astropy.units.Unit("gram")

                    if unitname == "system.get('S.I.').base('length')":
                        self.units[column_name] = astropy.units.Unit("m")
                    if unitname == "(system.get('S.I.').base('length') * (system.get('S.I.').base('time')**-1))":
                        self.units[column_name] = astropy.units.Unit("m/s")
                    if unitname == "system.get('S.I.').base('mass')":
                        self.units[column_name] = astropy.units.Unit("kg")
                data = column if self._version == 1 else column['data']
                if hasattr(data, "dtype"):
                    offset = data.id.get_offset()
                    if offset is None:
                        raise Exception(
                            "columns doesn't really exist in hdf5 file")
                    # a "dtype" attribute overrides the stored dtype
                    dtype = data.dtype
                    if "dtype" in data.attrs:
                        dtype = data.attrs["dtype"]
                    logger.debug("adding column %r with dtype %r",
                                 column_name, dtype)
                    self.addColumn(column_name,
                                   offset,
                                   len(data),
                                   dtype=dtype)
                    if self._version > 1 and 'mask' in column:
                        # map the mask through a temporary column, then
                        # unregister it and wrap the real column with it
                        mask = column['mask']
                        offset = mask.id.get_offset()
                        self.addColumn("temp_mask",
                                       offset,
                                       len(data),
                                       dtype=mask.dtype)
                        mask_array = self.columns['temp_mask']
                        del self.columns['temp_mask']
                        self.column_names.remove('temp_mask')
                        ar = self.columns[column_name] = np.ma.array(
                            self.columns[column_name],
                            mask=mask_array,
                            shrink=False)
                        assert ar.mask is mask_array, "masked array was copied"
            finished.add(column_name)
コード例 #5
0
ファイル: dataset.py プロジェクト: maartenbreddels/vaex
    def _load_columns(self, h5data, first=[]):
        """Load columns and their metadata from an open HDF5 group.

        Reads the dataset description, then walks every member of the columns
        group. Members with a ``type`` attribute of ``csr_matrix`` are mapped
        as one sparse matrix providing several columns; members without a
        ``type`` attribute are treated as regular columns (numeric, masked or
        string). Members with any other ``type`` value are ignored. Finally
        ``self.column_names`` is rebuilt so that columns named in the
        ``column_order`` attribute come first, in that order.

        Args:
            h5data: the HDF5 group holding the table; for file versions other
                than 1 the columns live in its ``'columns'`` subgroup.
            first: unused by this implementation.
        """
        if "description" in h5data.attrs:
            self.description = ensure_string(h5data.attrs["description"])
        # hdf5/h5py doesn't keep the order of columns, so it is tracked
        # manually in an attribute; this also enables reordering later
        h5columns = h5data if self._version == 1 else h5data['columns']
        if "column_order" in h5columns.attrs:
            column_order = ensure_string(h5columns.attrs["column_order"]).split(",")
        else:
            column_order = []
        for group_name in list(h5columns):
            group = h5columns[group_name]
            if 'type' in group.attrs:
                if group.attrs['type'] in ['csr_matrix']:
                    from scipy.sparse import csr_matrix

                    # subclass whose format check is a no-op, so constructing
                    # the matrix does not run scipy's validation
                    class csr_matrix_nocheck(csr_matrix):
                        def check_format(self, *args, **kwargs):
                            pass
                    data = self._map_hdf5_array(group['data'])
                    indptr = self._map_hdf5_array(group['indptr'])
                    indices = self._map_hdf5_array(group['indices'])
                    # make sure we keep the original order: every sub-group
                    # stores its own position in 'column_index'
                    groups = [(name, value) for name, value in group.items() if isinstance(value, h5py.Group)]
                    column_names = [None] * len(groups)
                    for name, column in groups:
                        column_names[column.attrs['column_index']] = name
                    matrix = csr_matrix_nocheck((data, indices, indptr), shape=(len(indptr)-1, len(column_names)))
                    # the mapped arrays must be used as-is, not copied
                    assert matrix.data is data
                    assert matrix.indices is indices
                    self.add_columns(column_names, matrix)
            else:
                column_name = group_name
                column = h5columns[column_name]
                if "ucd" in column.attrs:
                    self.ucds[column_name] = ensure_string(column.attrs["ucd"])
                if "description" in column.attrs:
                    self.descriptions[column_name] = ensure_string(column.attrs["description"])
                if "unit" in column.attrs:
                    # best effort: an unparsable unit is logged, not fatal
                    try:
                        unitname = ensure_string(column.attrs["unit"])
                        if unitname and unitname != "None":
                            self.units[column_name] = _try_unit(unitname)
                    except Exception:
                        logger.exception("error parsing unit: %s", column.attrs["unit"])
                if "units" in column.attrs:  # Amuse case
                    unitname = ensure_string(column.attrs["units"])
                    logger.debug("amuse unit: %s", unitname)
                    if unitname == "(0.01 * system.get('S.I.').base('length'))":
                        self.units[column_name] = astropy.units.Unit("cm")
                    if unitname == "((0.01 * system.get('S.I.').base('length')) * (system.get('S.I.').base('time')**-1))":
                        self.units[column_name] = astropy.units.Unit("cm/s")
                    if unitname == "(0.001 * system.get('S.I.').base('mass'))":
                        self.units[column_name] = astropy.units.Unit("gram")

                    if unitname == "system.get('S.I.').base('length')":
                        self.units[column_name] = astropy.units.Unit("m")
                    if unitname == "(system.get('S.I.').base('length') * (system.get('S.I.').base('time')**-1))":
                        self.units[column_name] = astropy.units.Unit("m/s")
                    if unitname == "system.get('S.I.').base('mass')":
                        self.units[column_name] = astropy.units.Unit("kg")
                data = column if self._version == 1 else column['data']
                if hasattr(data, "dtype"):
                    if "dtype" in data.attrs and data.attrs["dtype"] == "str":
                        # string column: byte buffer + offsets (+ null bitmap)
                        indices = self._map_hdf5_array(column['indices'])
                        string_bytes = self._map_hdf5_array(data)
                        if "null_bitmap" in column:
                            null_bitmap = self._map_hdf5_array(column['null_bitmap'])
                        else:
                            null_bitmap = None
                        from vaex.column import ColumnStringArrow
                        self.add_column(column_name, ColumnStringArrow(indices, string_bytes, null_bitmap=null_bitmap))
                    else:
                        # a "dtype" attribute overrides the stored dtype
                        dtype = data.dtype
                        if "dtype" in data.attrs:
                            dtype = data.attrs["dtype"]
                        logger.debug("adding column %r with dtype %r", column_name, dtype)
                        if self._version > 1 and 'mask' in column:
                            self.add_column(column_name, self._map_hdf5_array(data, column['mask']))
                        else:
                            self.add_column(column_name, self._map_hdf5_array(data))
        # rebuild the column name list: explicitly ordered columns first ...
        all_columns = dict(**self.columns)
        self.column_names = []
        for name in column_order:
            # column_order may refer to columns that do not exist (any more)
            # or repeat a name; skip those instead of raising KeyError
            if name in all_columns:
                self.columns[name] = all_columns.pop(name)
                self.column_names.append(name)
        # ... then the rest
        for name, col in all_columns.items():
            self.columns[name] = col
            self.column_names.append(name)
コード例 #6
0
ファイル: other.py プロジェクト: quiquinSP/vaex
	def _load_columns(self, h5data):
		"""Load columns and their metadata from an open HDF5 group.

		Reads the dataset description and, for every member of ``h5data``
		that has a ``dtype``, registers it via ``addColumn`` using the offset
		of its data in the file. The members named x, y, z, vx, vy and vz are
		processed first when present; each member is processed only once.

		Args:
			h5data: the HDF5 group whose members are the columns.

		Raises:
			Exception: if a column has no offset in the file
				(``get_offset()`` returned ``None``).
		"""
		# make sure x y z etc are first
		first = "x y z vx vy vz".split()
		finished = set()
		if "description" in h5data.attrs:
			self.description = ensure_string(h5data.attrs["description"])
		for column_name in first + list(h5data):
			if column_name in h5data and column_name not in finished:
				column = h5data[column_name]
				if "ucd" in column.attrs:
					self.ucds[column_name] = ensure_string(column.attrs["ucd"])
				if "description" in column.attrs:
					self.descriptions[column_name] = ensure_string(column.attrs["description"])
				if "unit" in column.attrs:
					# best effort: an unparsable unit is logged, not fatal
					try:
						unitname = ensure_string(column.attrs["unit"])
						if unitname and unitname != "None":
							self.units[column_name] = astropy.units.Unit(unitname)
					except Exception:
						logger.exception("error parsing unit: %s", column.attrs["unit"])
				if "units" in column.attrs: # Amuse case
					unitname = ensure_string(column.attrs["units"])
					logger.debug("amuse unit: %s", unitname)
					if unitname == "(0.01 * system.get('S.I.').base('length'))":
						self.units[column_name] = astropy.units.Unit("cm")
					if unitname == "((0.01 * system.get('S.I.').base('length')) * (system.get('S.I.').base('time')**-1))":
						self.units[column_name] = astropy.units.Unit("cm/s")
					if unitname == "(0.001 * system.get('S.I.').base('mass'))":
						self.units[column_name] = astropy.units.Unit("gram")

					if unitname == "system.get('S.I.').base('length')":
						self.units[column_name] = astropy.units.Unit("m")
					if unitname == "(system.get('S.I.').base('length') * (system.get('S.I.').base('time')**-1))":
						self.units[column_name] = astropy.units.Unit("m/s")
					if unitname == "system.get('S.I.').base('mass')":
						self.units[column_name] = astropy.units.Unit("kg")

				if hasattr(column, "dtype"):
					offset = column.id.get_offset()
					if offset is None:
						raise Exception("columns doesn't really exist in hdf5 file")
					# a "dtype" attribute overrides the stored dtype
					dtype = column.dtype
					if "dtype" in column.attrs:
						dtype = column.attrs["dtype"]
					logger.debug("adding column %r with dtype %r", column_name, dtype)
					self.addColumn(column_name, offset, len(column), dtype=dtype)
			finished.add(column_name)
コード例 #7
0
ファイル: dataset.py プロジェクト: t-triobox/vaex
    def _load_columns(self, h5data, first=[]):
        """Load columns and their metadata from an open HDF5 group.

        Reads the dataset description, then walks every member of the columns
        group. Members carrying a ``type`` attribute are loaded as either a
        ``csr_matrix`` (one sparse matrix providing several columns) or a
        ``dictionary_encoded`` column; members without a ``type`` attribute
        are regular columns. Finally ``self._columns`` is rebuilt so that
        columns named in the ``column_order`` attribute come first, in that
        order.

        Args:
            h5data: the HDF5 group holding the table; for file versions other
                than 1 the columns live in its ``'columns'`` subgroup.
            first: unused by this implementation.

        Raises:
            ValueError: if the indices of a dictionary encoded column contain
                a ``null_bitmap`` or ``mask`` member.
            TypeError: if a member has an unknown ``type`` attribute, or a
                regular column's ``data`` has no ``dtype``.
        """
        if "description" in h5data.attrs:
            self.description = ensure_string(h5data.attrs["description"])
        # hdf5/h5py doesn't keep the order of columns, so it is tracked
        # manually in an attribute; this also enables reordering later
        h5columns = h5data if self._version == 1 else h5data['columns']
        if "column_order" in h5columns.attrs:
            column_order = ensure_string(
                h5columns.attrs["column_order"]).split(",")
        else:
            column_order = []
        for group_name in list(h5columns):
            logger.debug('loading column: %s', group_name)
            group = h5columns[group_name]
            if 'type' in group.attrs:
                group_type = group.attrs['type']
                if group_type in ['csr_matrix']:
                    from scipy.sparse import csr_matrix

                    # subclass whose format check is a no-op, so constructing
                    # the matrix does not run scipy's validation
                    class csr_matrix_nocheck(csr_matrix):
                        def check_format(self, *args, **kwargs):
                            pass

                    data = self._map_hdf5_array(group['data'])
                    indptr = self._map_hdf5_array(group['indptr'])
                    indices = self._map_hdf5_array(group['indices'])
                    # make sure we keep the original order: every sub-group
                    # stores its own position in 'column_index'
                    groups = [(name, value) for name, value in group.items()
                              if isinstance(value, h5py.Group)]
                    column_names = [None] * len(groups)
                    for name, column in groups:
                        column_names[column.attrs['column_index']] = name
                    matrix = csr_matrix_nocheck(
                        (data, indices, indptr),
                        shape=(len(indptr) - 1, len(column_names)))
                    # the mapped arrays must be used as-is, not copied
                    assert matrix.data is data
                    assert matrix.indices is indices
                    self.add_columns(column_names, matrix)
                # must be `elif`: with a plain `if` the `else` below would
                # reject every csr_matrix group right after loading it
                elif group_type == 'dictionary_encoded':
                    index = self._map_column(group['indices'], as_arrow=True)
                    values = self._map_column(group['dictionary'],
                                              as_arrow=True)
                    if 'null_bitmap' in group['indices'] or 'mask' in group[
                            'indices']:
                        raise ValueError(
                            f'Did not expect null data in encoded column {group_name}'
                        )
                    if isinstance(values, vaex.column.Column):
                        encoded = vaex.column.ColumnArrowDictionaryEncoded(
                            index, values)
                    else:
                        encoded = pa.DictionaryArray.from_arrays(index, values)
                    self.add_column(group_name, encoded)
                else:
                    raise TypeError(
                        f'Unexpected type {group_type!r} in {group_name}')
            else:
                column_name = group_name
                column = h5columns[column_name]
                if "alias" in column.attrs:
                    column_name = column.attrs["alias"]
                if "ucd" in column.attrs:
                    self.ucds[column_name] = ensure_string(column.attrs["ucd"])
                if "description" in column.attrs:
                    self.descriptions[column_name] = ensure_string(
                        column.attrs["description"])
                if "unit" in column.attrs:
                    # best effort: an unparsable unit is logged, not fatal
                    try:
                        unitname = ensure_string(column.attrs["unit"])
                        if unitname and unitname != "None":
                            self.units[column_name] = _try_unit(unitname)
                    except Exception:
                        logger.exception("error parsing unit: %s",
                                         column.attrs["unit"])
                if "units" in column.attrs:  # Amuse case
                    unitname = ensure_string(column.attrs["units"])
                    logger.debug("amuse unit: %s", unitname)
                    if unitname == "(0.01 * system.get('S.I.').base('length'))":
                        self.units[column_name] = astropy.units.Unit("cm")
                    if unitname == "((0.01 * system.get('S.I.').base('length')) * (system.get('S.I.').base('time')**-1))":
                        self.units[column_name] = astropy.units.Unit("cm/s")
                    if unitname == "(0.001 * system.get('S.I.').base('mass'))":
                        self.units[column_name] = astropy.units.Unit("gram")

                    if unitname == "system.get('S.I.').base('length')":
                        self.units[column_name] = astropy.units.Unit("m")
                    if unitname == "(system.get('S.I.').base('length') * (system.get('S.I.').base('time')**-1))":
                        self.units[column_name] = astropy.units.Unit("m/s")
                    if unitname == "system.get('S.I.').base('mass')":
                        self.units[column_name] = astropy.units.Unit("kg")
                if self._version == 1:
                    column = self._map_hdf5_array(column)
                    self.add_column(column_name, column)
                elif hasattr(column["data"], "dtype"):
                    column = self._map_column(column)
                    self.add_column(column_name, column)
                    dtype = vaex.dtype_of(column)
                    logger.debug("adding column %r with dtype %r", column_name,
                                 dtype)
                else:
                    raise TypeError(f'{group_name} is missing dtype')

        # rebuild the column mapping: explicitly ordered columns first ...
        all_columns = dict(**self._columns)
        self._columns = {}
        for name in column_order:
            # skip names that refer to non-existing (or repeated) columns
            if name in all_columns:
                self._columns[name] = all_columns.pop(name)
        # ... then the rest
        for name, col in all_columns.items():
            self._columns[name] = col
コード例 #8
0
ファイル: dataset.py プロジェクト: zennsocial/vaex
    def _load_columns(self, h5data, first=[]):
        """Register the columns stored under *h5data* on this dataset.

        The optional ``description`` attribute of *h5data* is copied to
        ``self.description``. The column container is *h5data* itself for
        version 1 files and ``h5data['columns']`` otherwise. Entries carrying
        a ``type`` attribute equal to ``'csr_matrix'`` are added as one sparse
        matrix through ``add_columns``; entries with any other ``type`` are
        ignored. Entries without a ``type`` attribute are treated as regular
        columns: their ``ucd``, ``description`` and ``unit``/``units``
        attributes are recorded and the data is added through ``add_column``.

        Finally ``self.column_names`` is rebuilt: names listed in the
        ``column_order`` attribute come first, the remaining columns follow.

        :param h5data: h5py group-like object holding the table.
        :param first: unused; kept so existing callers keep working. It is
            never mutated, so the shared default list is harmless.
        """
        # Amuse stores units as expressions; map the known ones to astropy
        # unit names.
        amuse_units = {
            "(0.01 * system.get('S.I.').base('length'))": "cm",
            "((0.01 * system.get('S.I.').base('length')) * (system.get('S.I.').base('time')**-1))": "cm/s",
            "(0.001 * system.get('S.I.').base('mass'))": "gram",
            "system.get('S.I.').base('length')": "m",
            "(system.get('S.I.').base('length') * (system.get('S.I.').base('time')**-1))": "m/s",
            "system.get('S.I.').base('mass')": "kg",
        }

        if "description" in h5data.attrs:
            self.description = ensure_string(h5data.attrs["description"])
        # hdf5/h5py doesn't keep the order of columns, so it is tracked
        # manually in an attribute; this also enables reordering later.
        h5columns = h5data if self._version == 1 else h5data['columns']
        if "column_order" in h5columns.attrs:
            column_order = ensure_string(
                h5columns.attrs["column_order"]).split(",")
        else:
            column_order = []

        for group_name in list(h5columns):
            group = h5columns[group_name]
            if 'type' in group.attrs:
                if group.attrs['type'] in ['csr_matrix']:
                    from scipy.sparse import csr_matrix

                    class csr_matrix_nocheck(csr_matrix):
                        # Skip scipy's format validation when wrapping the
                        # mapped arrays.
                        def check_format(self, *args, **kwargs):
                            pass

                    data = self._map_hdf5_array(group['data'])
                    indptr = self._map_hdf5_array(group['indptr'])
                    indices = self._map_hdf5_array(group['indices'])
                    # Sub-groups carry a 'column_index' attribute; use it to
                    # restore the original column order.
                    groups = [(name, value) for name, value in group.items()
                              if isinstance(value, h5py.Group)]
                    column_names = [None] * len(groups)
                    for name, column in groups:
                        column_names[column.attrs['column_index']] = name
                    matrix = csr_matrix_nocheck(
                        (data, indices, indptr),
                        shape=(len(indptr) - 1, len(column_names)))
                    # The matrix must wrap the mapped arrays, not copies.
                    assert matrix.data is data
                    assert matrix.indices is indices
                    self.add_columns(column_names, matrix)
                # Typed groups other than csr_matrix are skipped.
                continue

            column_name = group_name
            column = group
            if "ucd" in column.attrs:
                self.ucds[column_name] = ensure_string(column.attrs["ucd"])
            if "description" in column.attrs:
                self.descriptions[column_name] = ensure_string(
                    column.attrs["description"])
            if "unit" in column.attrs:
                # Best effort: an unparsable unit is logged, not fatal.
                try:
                    unitname = ensure_string(column.attrs["unit"])
                    if unitname and unitname != "None":
                        self.units[column_name] = _try_unit(unitname)
                except Exception:
                    logger.exception("error parsing unit: %s",
                                     column.attrs["unit"])
            if "units" in column.attrs:  # Amuse case
                unitname = ensure_string(column.attrs["units"])
                logger.debug("amuse unit: %s", unitname)
                amuse_unit = amuse_units.get(unitname)
                if amuse_unit is not None:
                    self.units[column_name] = astropy.units.Unit(amuse_unit)

            data = column if self._version == 1 else column['data']
            # Entries without a dtype are not array data and are skipped.
            if hasattr(data, "dtype"):
                dtype = data.dtype
                if "dtype" in data.attrs:
                    dtype = data.attrs["dtype"]
                logger.debug("adding column %r with dtype %r",
                             column_name, dtype)
                if self._version > 1 and 'mask' in column:
                    self.add_column(
                        column_name,
                        self._map_hdf5_array(data, column['mask']))
                else:
                    self.add_column(column_name,
                                    self._map_hdf5_array(data))

        all_columns = dict(self.columns)
        self.column_names = []
        for name in column_order:
            # column_order may refer to non-existing columns, contain an empty
            # entry (empty attribute) or list a name twice; skip those
            # instead of failing with a KeyError.
            if name not in all_columns:
                continue
            self.columns[name] = all_columns.pop(name)
            self.column_names.append(name)
        # add the rest
        for name, col in all_columns.items():
            self.columns[name] = col
            self.column_names.append(name)