def _add_field_factory(field_type: str) -> Callable:
    """Create an add-field function for one particular field type

    The constructor for the field type is looked up once, when the factory is called, and is
    reused by the returned function for every field it adds.

    Args:
        field_type:  Name of the field type, as understood by fieldtypes.function.

    Returns:
        Function taking a dataset as its first argument that adds a field of the given type.
    """
    func = fieldtypes.function(field_type)

    def _add_field(self, name, val=None, unit=None, write_level=None, suffix=None, **field_args):
        """Add a {field_type} field to the dataset

        Args:
            name:         Name of the field. Everything before the last dot, if any, is taken as the
                          name of a collection, which is created first if it is not already a field.
            val:          Passed on to the field constructor.
            unit:         Passed on to the field constructor.
            write_level:  Passed on to the field constructor.
            suffix:       Accepted, but not used by this function.
            field_args:   Additional keyword arguments passed on to the field constructor.

        Raises:
            FieldExistsError: If name is already a key in the fields of the dataset.
        """
        if name in self._fields:
            raise exceptions.FieldExistsError(f"Field {name!r} already exists in dataset")

        # Create collections for nested fields
        collection, _, field_name = name.rpartition(".")
        if collection and collection not in self._fields:
            self.add_collection(collection)

        # Create field
        field = func(
            num_obs=self.num_obs,
            name=field_name,
            val=val,
            unit=unit,
            write_level=write_level,
            **field_args,
        )

        # Add field to list of fields: nested fields are stored on the collection, others on the dataset
        fields = getattr(self, collection) if collection else self._fields
        fields[field_name] = field

    # Docstrings are discarded when Python runs with -OO, in which case __doc__ is None and
    # there is nothing to fill in
    if _add_field.__doc__ is not None:
        _add_field.__doc__ = _add_field.__doc__.format(field_type=field_type)
    return _add_field
def read(cls, file_path: Union[str, pathlib.Path]) -> "Dataset":
    """Read a dataset from a HDF5 file

    Args:
        file_path:  Path of the file to read.

    Returns:
        Dataset with the variables, fields and meta information stored in the file.
    """
    log.debug(f"Read dataset from {file_path}")

    # Keeps track of references in the data structure. Maps the name of each field that has been
    # read to its data object (TimeArray, PositionArray, etc), and is handed to every field reader
    memo = {}

    with h5py.File(file_path, mode="r") as h5_file:
        attrs = h5_file.attrs
        dset = cls(num_obs=attrs["num_obs"])
        dset.vars.update(_h5utils.decode_h5attr(attrs["vars"]))

        # The "fields" attribute maps field names to the names of their field types
        stored_fields = _h5utils.decode_h5attr(attrs["fields"])
        for fieldname, fieldtype in stored_fields.items():
            reader = fieldtypes.function(fieldtype)
            field = reader.read(h5_file[fieldname], memo)
            dset._fields[fieldname] = field
            memo[fieldname] = field.data

        # Meta information is stored in its own group
        dset.meta.read(h5_file["__meta__"])

    return dset
def read(cls, h5_group, memo):
    """Read a field, together with the fields nested inside it, from a HDF5 group

    Args:
        h5_group:  HDF5 group with "fieldname" and "fields" attributes, and one entry per nested field.
        memo:      Passed on unchanged to the reader of every nested field.

    Returns:
        New field of type cls with the nested fields stored in the _fields of its data.
    """
    attrs = h5_group.attrs
    name = attrs["fieldname"]
    container = cls(num_obs=None, name=name, val=None)  # num_obs and val not used

    # The "fields" attribute maps names of nested fields to the names of their field types
    nested = _h5utils.decode_h5attr(attrs["fields"])
    for fieldname, fieldtype in nested.items():
        reader = fieldtypes.function(fieldtype)
        container.data._fields[fieldname] = reader.read(h5_group[fieldname], memo)

    return container
def difference(self, other, index_by=None, copy_self_on_error=False, copy_other_on_error=False):
    """Compute the difference between two datasets: self - other

    Fields named in index_by are copied from self to the difference dataset and are excluded from
    the - operation.

    Args:
        other:                Dataset to subtract from self.
        index_by:             Comma separated text string with names of fields (columns) that are used
                              to find common elements (rows). If None, the datasets are differenced
                              row by row and must have the same number of observations.
        copy_self_on_error:   Copy value of fields in self to the difference dataset if the - operation
                              fails for a field.
        copy_other_on_error:  Copy value of fields in other to the difference dataset if the - operation
                              fails for a field.

    Returns:
        A new dataset with fields that contain the difference between fields in self and other.

    Raises:
        ValueError: If index_by is None and the datasets differ in length, or if index_by is given
                    and the datasets have no rows in common.
    """
    # Parse the field names once; None means that no indexing was requested
    index_names = None if index_by is None else [part.strip() for part in index_by.split(",")]

    if index_names is None:
        # Row by row difference: every observation in both datasets is used
        num_obs = len(self)
        if num_obs != len(other):
            raise ValueError(
                f"Cannot compute difference between datasets with different number of observations ({self.num_obs} vs {other.num_obs})"
            )
        self_idx = np.ones(num_obs, dtype=bool)
        other_idx = np.ones(len(other), dtype=bool)
    else:
        # Combine the index fields of each dataset into one record per row, and match the records
        self_columns = [self[field_name] for field_name in index_names]
        other_columns = [other[field_name] for field_name in index_names]
        self_records = np.rec.fromarrays(self_columns)
        other_records = np.rec.fromarrays(other_columns)

        common, self_idx, other_idx = np.intersect1d(self_records, other_records, return_indices=True)
        num_obs = len(common)
        if num_obs == 0:
            raise ValueError(
                f"Nothing to differentiate. No common data found for chosen option index_by '{index_by}'."
            )

    result = self._difference(
        other,
        num_obs,
        self_idx,
        other_idx,
        copy_self_on_error=copy_self_on_error,
        copy_other_on_error=copy_other_on_error,
    )

    if index_names is None:
        return result

    # Overwrite the difference of each index_by field with the original value from self
    for field_name in index_names:
        try:
            del result[field_name]
        except AttributeError:
            # Field does not exist so no need to delete
            pass

        values = self[field_name][self_idx]
        make_field = fieldtypes.function(fieldtypes.fieldtype(values))
        source = self.field(field_name)
        result._fields[field_name] = make_field(
            num_obs=num_obs,
            name=field_name,
            val=values,
            unit=source._unit,
            write_level=source._write_level.name,
        )

    return result
def _difference(self, other, num_obs, self_idx, other_idx, copy_self_on_error=False, copy_other_on_error=False): """Perform the - operation for each field in self and other""" result = self.__class__() for fieldname, field in self._fields.items(): if fieldname in other._fields: try: factors = [Unit(_from, _to) for _to, _from in zip(field._unit, other._fields[fieldname]._unit)] except TypeError: factors = None except exceptions.UnitError as err: raise ValueError(f"Cannot compute difference for field `{fieldname}`: {err}") try: if factors: difference = self[fieldname][self_idx] - other[fieldname][other_idx] * np.array(factors) else: difference = self[fieldname][self_idx] - other[fieldname][other_idx] fieldtype = fieldtypes.fieldtype(difference) func = fieldtypes.function(fieldtype) field = func( num_obs=num_obs, name=fieldname, val=difference, unit=field._unit, write_level=field._write_level.name, ) result.add_field(fieldname, field) except IndexError as err: # fieldname is a collection collection = self[fieldname]._difference( other[fieldname], num_obs, self_idx, other_idx, copy_self_on_error=copy_self_on_error, copy_other_on_error=copy_other_on_error, ) fieldtype = fieldtypes.fieldtype(collection) func = fieldtypes.function(fieldtype) field = func( num_obs=num_obs, name=fieldname, val=collection, unit=field._unit, write_level=field._write_level.name, ) result.add_field(fieldname, field) except TypeError as err: # Fields that do not support the - operator if copy_self_on_error: index_data = self[fieldname][self_idx] fieldtype = fieldtypes.fieldtype(index_data) func = fieldtypes.function(fieldtype) self_fieldname = f"{fieldname}_self" field = func( num_obs=num_obs, name=self_fieldname, val=index_data, unit=field._unit, write_level=field._write_level.name, ) result.add_field(self_fieldname, field) if copy_other_on_error: index_data = other[fieldname][other_idx] fieldtype = fieldtypes.fieldtype(index_data) func = fieldtypes.function(fieldtype) other_fieldname = 
f"{fieldname}_other" field = func( num_obs=num_obs, name=other_fieldname, val=index_data, unit=other._fields[fieldname]._unit, write_level=other._fields[fieldname]._write_level.name, ) result.add_field(other_fieldname, field) return result