def test_dataset_buffer__write_value():
    """Round-trip test for DatasetBuffer.write_value.

    Writes 5000 identical rows through the buffer into a temporary HDF5
    file, verifies the internal buffer index before and after flushing,
    then re-opens the file and checks the stored data, the recorded
    column names, and the to_dataframe conversion.
    """
    path = os.path.join(tempfile.gettempdir(), "store.h5")
    try:
        col_names = ("1", "2", "3", "4")
        num_rows = 5000
        with h5py.File(path, "w") as h5f:
            buf = DatasetBuffer(
                h5f, "data", num_rows, float, col_names, max_chunk_bytes=128 * 1024
            )
            # 128 KiB chunks of 4 float64 columns -> 4096 rows per chunk.
            assert buf.chunk_count == 4096
            for _ in range(num_rows):
                buf.write_value(np.ones(4))
            # Rows that did not fill a whole chunk are still buffered.
            assert buf._buf_index == num_rows - buf.chunk_count
            buf.flush_data()
            assert buf._buf_index == 0
        with h5py.File(path, "r") as h5f:
            stored = h5f["data"][:]
            assert len(stored) == num_rows
            stored_columns = DatasetBuffer.get_columns(h5f["data"])
            assert [x for x in stored_columns] == list(col_names)
            for row in range(num_rows):
                for col in range(4):
                    assert stored[row][col] == 1.0
            frame = DatasetBuffer.to_dataframe(h5f["data"])
            assert isinstance(frame, pd.DataFrame)
            assert len(frame) == num_rows
            assert frame.iloc[0, 0] == 1.0
    finally:
        if os.path.exists(path):
            os.remove(path)
def _get_filtered_dataframe(self, elem_class, prop, name, dataset, real_only=False, abs_val=False, **kwargs):
    """Return a dataframe with only the rows of *dataset* belonging to one element.

    Parameters
    ----------
    elem_class : str
        Element class used to look up the element's index.
    prop : str
        Property name used to look up the element's index.
    name : str
        Element name; also passed to _fix_columns for column labeling.
    dataset : h5py.Dataset
        Source dataset; its "length" and "time_step_path" attrs are read.
    real_only : bool
        If a value is complex, keep only the real component.
    abs_val : bool
        If a value is complex, store its absolute value instead.

    Returns
    -------
    pd.DataFrame
        Indexed by timestamp (from _get_indices_df); one row per matching
        dataset row.

    """
    indices_df = self._get_indices_df()
    elem_index = self._elem_indices_by_prop[elem_class][prop][name]
    length = dataset.attrs["length"]
    data_vals = dataset[:length]

    # The time_step_dataset has these columns:
    # 1. time step index
    # 2. element index
    # Each row describes the source data in the dataset row.
    path = dataset.attrs["time_step_path"]
    time_step_data = self._hdf_store[path][:length]
    assert length == self._hdf_store[path].attrs["length"]

    # Hoist the column views out of the loop; the original re-sliced
    # time_step_data[:, 0] / time_step_data[:, 1] on every iteration.
    ts_indices = time_step_data[:, 0]
    elem_indices = time_step_data[:, 1]
    data = []
    timestamps = []
    for i in range(length):
        if elem_indices[i] == elem_index:
            # TODO DT: more than one column?
            val = data_vals[i, 0]
            # TODO: profile this vs a df operation at end
            if real_only:
                val = val.real
            elif abs_val:
                val = abs(val)
            data.append(val)
            timestamps.append(indices_df.iloc[ts_indices[i], 0])

    columns = self._fix_columns(name, DatasetBuffer.get_columns(dataset))
    return pd.DataFrame(data, columns=columns, index=timestamps)
def get_filtered_dataframes(self, element_class, prop, real_only=False, abs_val=False):
    """Return the dataframes for all elements.

    Calling this is much more efficient than calling get_dataframe for
    each element.

    Parameters
    ----------
    element_class : str
    prop : str
    real_only : bool
        If dtype of any column is complex, drop the imaginary component.
    abs_val : bool
        If dtype of any column is complex, compute its absolute value.

    Returns
    -------
    dict
        key = str (name), val = pd.DataFrame
        The dict will be empty if no data was stored.

    """
    if prop not in self.list_element_properties(element_class):
        logger.debug("%s/%s is not stored", element_class, prop)
        return {}

    dataset = self._group[f"{element_class}/ElementProperties/{prop}"]
    columns = DatasetBuffer.get_columns(dataset)
    names = DatasetBuffer.get_names(dataset)
    length = dataset.attrs["length"]
    indices_df = self._get_indices_df()
    data_vals = dataset[:length]
    elem_data = defaultdict(list)
    elem_timestamps = defaultdict(list)

    # The time_step_dataset has these columns:
    # 1. time step index
    # 2. element index
    # Each row describes the source data in the dataset row.
    path = dataset.attrs["time_step_path"]
    assert length == self._hdf_store[path].attrs["length"]
    time_step_data = self._hdf_store[path][:length]

    # Hoist the column views out of the loop; the original re-sliced
    # time_step_data[:, 0] / time_step_data[:, 1] on every iteration.
    ts_indices = time_step_data[:, 0]
    elem_indices = time_step_data[:, 1]
    for i in range(length):
        elem_index = elem_indices[i]
        # TODO DT: more than one column?
        val = data_vals[i, 0]
        if real_only:
            val = val.real
        elif abs_val:
            val = abs(val)
        elem_data[elem_index].append(val)
        elem_timestamps[elem_index].append(indices_df.iloc[ts_indices[i], 0])

    dfs = {}
    for elem_index, vals in elem_data.items():
        elem_name = names[elem_index]
        cols = self._fix_columns(elem_name, columns)
        dfs[elem_name] = pd.DataFrame(
            vals,
            columns=cols,
            index=elem_timestamps[elem_index],
        )
    return dfs