def blosc_args(dt):
    """Return BloscArgs tuned for the given numpy dtype, or None.

    Integer and datetime64 data get shuffling at a higher compression
    level; floats get a cheap level with no shuffle. Dtypes that match
    none of the checks yield None (caller decides the fallback).
    """
    # np.integer / np.floating are the abstract scalar supertypes and
    # cover all widths (including unsigned ints), unlike the builtin
    # int/float which np.issubdtype maps only to signed/double kinds.
    if np.issubdtype(dt, np.integer):
        return bloscpack.BloscArgs(dt.itemsize, clevel=3, shuffle=True)
    if np.issubdtype(dt, np.datetime64):
        return bloscpack.BloscArgs(dt.itemsize, clevel=3, shuffle=True)
    if np.issubdtype(dt, np.floating):
        return bloscpack.BloscArgs(dt.itemsize, clevel=1, shuffle=False)
    return None
def save_object_to_disk(
    obj,
    file_path: Path,
    compression_method: "str | None" = None,  # None / "gzip" / "blosc"
    obj_name: "str | None" = None,
    element_size_estimate_mb: "float | None" = None,
) -> bool:
    """
    Save a pickle-serialized and optionally gzip/blosc-compressed object to disk.

    If `element_size_estimate_mb` is given, `obj` is assumed iterable and is
    split into chunks of roughly MAX_CHUNK_MB each, each chunk pickled
    separately (the matching loader must therefore unpickle repeatedly until
    EOF). Otherwise `obj` is written as a single pickle.

    Returns True once the file is written.
    NOTE(review): the original docstring promised False when an estimated
    size limit is exceeded, but no such check exists in this code — confirm
    intent with the caller.
    """
    # create parent directory if needed (idiomatic bound-method call,
    # instead of the unbound Path.mkdir(dir_path, ...) form)
    file_path.parent.mkdir(parents=True, exist_ok=True)

    # split iterables to chunks if possible
    if element_size_estimate_mb is not None:
        MAX_CHUNK_MB = 2000  # should be about optimized
        chunk_size = max(int(MAX_CHUNK_MB / element_size_estimate_mb), 1)
        chunked_obj = list(chunks(obj, chunk_size))
    else:
        # obj isn't iterable - treat as a single chunk
        chunked_obj = [obj]

    if compression_method == "gzip":
        with gzip.open(file_path, "wb", compresslevel=1) as f_gzip:
            for chunk_ in chunked_obj:
                pickle.dump(chunk_, f_gzip, protocol=-1)  # type: ignore
    elif compression_method == "blosc":
        # renamed from `blosc_args` so the local no longer shadows the
        # module-level blosc_args() helper
        b_args = bloscpack.BloscArgs(typesize=4, clevel=1, cname="zlib")
        with open(file_path, "wb") as f_blosc:
            for chunk_ in chunked_obj:
                # blosc compresses the pickled bytes; the compressed blob is
                # itself pickled so the reader can unpickle chunk-by-chunk
                p_chunk = pickle.dumps(chunk_, protocol=-1)
                cmprsd_chunk = bloscpack.pack_bytes_to_bytes(
                    p_chunk, blosc_args=b_args
                )
                pickle.dump(cmprsd_chunk, f_blosc, protocol=-1)
    else:
        # save uncompressed
        with open(file_path, "wb") as f:
            for chunk_ in chunked_obj:
                pickle.dump(chunk_, f, protocol=-1)

    logging.debug(
        f"Object '{obj_name}' of class '{obj.__class__.__name__}' ({compression_method}-compressed) saved as: {file_path}"
    )
    return True