def validate(self):
    """Verify the chunk's integrity.

    Every field value must be a list or a numpy array, and all field
    values must share the same length.

    :raises DataChunkError: if a field value has a wrong container type
        or the field value lengths disagree.
    """
    type_err = "Data-chunk field values must be numpy arrays " \
               "or lists, while '%s' field contains: '%s'."
    len_err = "All data-chunk field value arrays/lists " \
              "must be of the same size."
    seen_lengths = set()
    for field_name, values in self.items():
        if not isinstance(values, (list, np.ndarray)):
            raise DataChunkError(type_err % (field_name,
                                             type(values).__name__))
        seen_lengths.add(len(values))
        # A second distinct length means two fields disagree.
        if len(seen_lengths) > 1:
            raise DataChunkError(len_err)
def iter(self):
    """Generator over the chunk's data-units, one unit per index.

    :raises DataChunkError: if the data-chunk is not valid.
    """
    if not self._is_valid():
        raise DataChunkError("Can't iterate over an invalid data-chunk.")
    # Hoist the length once, matching range()'s snapshot semantics.
    total = len(self)
    position = 0
    while position < total:
        yield self[position]
        position += 1
def absorb_and_yield_if_full(self, data_chunk):
    """Absorb `data_chunk` unit-by-unit, grouping by the `id_fname` field.

    Units are accumulated into the internal collector `self._coll`.
    Whenever the group id changes between consecutive units, the collected
    group is flushed via `yield_remaining` and the collector is reset.

    :param data_chunk: input data-chunk; must expose `fnames`, `__len__`
        and `[index, field_name]` item access.
    :raises DataChunkError: if input chunks carry different field names.
    """
    for indx in range(len(data_chunk)):
        group_id = data_chunk[indx, self.id_fname]
        # Compare against None explicitly: the previous truthiness test
        # skipped the flush whenever the preceding group id was falsy
        # (e.g. 0 or ""), silently merging distinct groups.
        if self._prev_group_id is not None and \
                group_id != self._prev_group_id:
            for chunk in self.yield_remaining():
                yield chunk
            self.reset()
        self._prev_group_id = group_id
        if not len(self._coll):
            # Lazily create the collector's fields, mirroring the input
            # chunk's per-field container types (ndarray vs. list).
            for fn in data_chunk.fnames:
                if isinstance(data_chunk[fn], np.ndarray):
                    self._coll[fn] = np.array([], dtype=data_chunk[fn].dtype)
                else:
                    self._coll[fn] = []
        for fn in data_chunk.fnames:
            val = data_chunk[indx, fn]
            if fn not in self._coll:
                raise DataChunkError("Input chunks have different field "
                                     "names.")
            if isinstance(self._coll[fn], np.ndarray):
                self._coll[fn] = np.append(self._coll[fn], val)
            else:
                self._coll[fn].append(val)
def append(self, data_unit):
    """
    Appends a new data-unit to the end of a valid data-chunk.

    :param data_unit: data unit or dict with field name and value pairs.
    :raises TypeError: if `data_unit` has a wrong type, or a field's
        storage type cannot be handled.
    :raises DataChunkError: if the data-chunk is already invalid.
    :raises ValueError: if `data_unit`'s keys do not exactly match the
        chunk's existing field names.
    """
    allowed_types = (dict, OrderedDict, DataUnit)
    rpr = [at.__name__ for at in allowed_types]
    if not isinstance(data_unit, allowed_types):
        raise TypeError("'data_unit' must be %s." % " or ".join(rpr))
    if not self._is_valid():
        raise DataChunkError("Can't append a new data-unit to an "
                             "invalid data-chunk.")
    if len(self.fnames) > 0:
        # Require exact key-set equality. The previous check only rejected
        # unknown keys; a unit MISSING an existing field slipped through and
        # silently de-synchronized the field value lengths, contradicting
        # the error message below.
        if set(data_unit) != set(self.fnames):
            raise ValueError(
                "Please provide all keys matching existing "
                "field names.")
    else:
        # First unit: create empty storage per field, mirroring the unit's
        # own per-field storage types (list vs. ndarray) when known.
        for k in data_unit:
            if isinstance(data_unit, DataUnit):
                ds_type = data_unit.ds_type(k)
                if ds_type == list:
                    ds = []
                elif ds_type == np.ndarray:
                    ds = np.array([])
                else:
                    raise TypeError("Can't handle '%s' type." % ds_type)
            else:
                ds = []
            self[k] = ds
    for k in data_unit:
        if isinstance(self[k], np.ndarray):
            self[k] = np.append(self[k], data_unit[k])
        elif isinstance(self[k], list):
            self[k].append(data_unit[k])
        else:
            raise NotImplementedError
def _valid(dc):
    """Checks that `dc` is a DataChunk instance and validates it.

    :param dc: object to check.
    :raises DataChunkError: if `dc` is not a DataChunk or fails validation.
    """
    if not isinstance(dc, DataChunk):
        raise DataChunkError("The data-chunk is an invalid object.")
    # Validate the argument itself; the original called
    # `data_chunk.validate()`, a name not defined in this scope, which
    # raised NameError for every otherwise-valid input.
    dc.validate()