def _open_cache(self, location):
    if self.overwrite:
        self.cache = zarr.open(
            location,
            mode='w',
            shape=(self.cache_size,),
            chunks=(1,),
            dtype=object,
            object_codec=numcodecs.Pickle(),
            synchronizer=zarr.ThreadSynchronizer(),
        )
    elif os.path.exists(location):
        self.cache = zarr.open(
            location,
            mode='r',
            object_codec=numcodecs.Pickle(),
            synchronizer=zarr.ThreadSynchronizer(),
        )
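# A minimal round-trip sketch for the pickle-backed cache above. The path
# "/tmp/demo_cache.zarr" and the stored dict are illustrative, not part of
# the original class; the zarr/numcodecs calls mirror the ones used above.
import numcodecs
import zarr

cache = zarr.open(
    "/tmp/demo_cache.zarr",
    mode='w',
    shape=(4,),
    chunks=(1,),
    dtype=object,
    object_codec=numcodecs.Pickle(),
    synchronizer=zarr.ThreadSynchronizer(),
)
cache[0] = {"tokens": ["a", "b"], "score": 0.7}  # any picklable object
assert cache[0]["score"] == 0.7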
def __init__(
        self, max_strokes, max_stroke_length, batch_size, max_per_class=1000,
        root_dir=os.environ["QUICKDRAW_DATA_ROOT"], arr_dir="processed_data",
        transform=None):
    self.arr_dir = arr_dir
    self.root_dir = root_dir
    self.max_strokes = max_strokes
    self.max_stroke_length = max_stroke_length
    self.max_per_class = max_per_class
    self.batch_size = batch_size
    self.transform = transform
    self.zarr_kwargs = dict(
        compressor=Blosc(),
        chunks=(512,),
        dtype=object,
        object_codec=numcodecs.Pickle(),
    )
    if not os.path.exists(self.get_arr_dir()):
        self.preprocess(root_dir)
    self.drawings, self.classes = (
        zarr.open(self.get_arr_path("drawings"), "r"),
        zarr.open(self.get_arr_path("classes"), "r")[:],
    )
    with open(self.get_json_path()) as f:
        d = json.load(f)
    self.class2label, self.country2label = d["class2label"], d["country2label"]
def create_array(ds_group, column, column_schema, schema_chunks, coordinate=False):
    codec = numcodecs.Pickle() if column_schema.dtype == object else None

    if column_schema.chunks is None:
        try:
            # No column chunking found, probably an ndarray,
            # derive column chunking from chunks on dataset
            chunks = tuple(schema_chunks[d] for d in column_schema.dims)
        except KeyError:
            # Nope, just set chunks equal to dimension size
            chunks = tuple((s,) for s in column_schema.shape)
    else:
        chunks = column_schema.chunks

    zchunks = zarr_chunks(column, column_schema.dims, chunks)

    array = ds_group.require_dataset(
        column,
        column_schema.shape,
        chunks=zchunks,
        dtype=column_schema.dtype,
        object_codec=codec,
        exact=True,
    )

    array.attrs[DASKMS_ATTR_KEY] = {
        "dims": column_schema.dims,
        "coordinate": coordinate,
        "array_type": encode_type(column_schema.type),
    }
def create_array(ds_group, column, schema, coordinate=False):
    # NOTE: np.object was removed in NumPy 1.24; compare against the
    # builtin object instead
    codec = numcodecs.Pickle() if schema.dtype == object else None
    zchunks = zarr_chunks(column, schema.dims, schema.chunks)

    array = ds_group.require_dataset(
        column,
        schema.shape,
        chunks=zchunks,
        dtype=schema.dtype,
        object_codec=codec,
        exact=True,
    )

    if zchunks is not None:
        # Expand zarr chunks to full dask resolution
        # for comparison purposes
        zchunks = normalize_chunks(array.chunks, schema.shape)

        if zchunks != schema.chunks:
            raise ValueError(
                f"zarr chunks {zchunks} "
                f"don't match dask chunks {schema.chunks}. "
                f"This can cause data corruption as described in "
                f"https://zarr.readthedocs.io/en/stable/tutorial.html"
                f"#parallel-computing-and-synchronization")

    array.attrs[DASKMS_ATTR_KEY] = {
        "dims": schema.dims,
        "coordinate": coordinate,
        "array_type": encode_type(schema.type),
    }
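# A self-contained sketch of the codec-selection pattern shared by both
# create_array variants above (the group path, column names, and dtypes are
# made up): object-dtype columns get a Pickle codec, numeric ones get none,
# and require_dataset(..., exact=True) fails fast on a dtype mismatch with
# an existing array.
import numcodecs
import zarr

group = zarr.open_group("/tmp/demo_ms.zarr", mode="w")
for name, dtype in [("FLAGS", "u1"), ("META", object)]:
    codec = numcodecs.Pickle() if dtype == object else None
    group.require_dataset(
        name, shape=(10,), chunks=(5,), dtype=dtype,
        object_codec=codec, exact=True,
    )
group["META"][0] = {"antenna": 3}  # round-trips through pickle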
def append(self, key, obj):
    if isinstance(obj, (np.ndarray, float, int, np.generic)):
        if isinstance(obj, (float, int, np.generic)):
            obj = np.array(obj)
        d = self.f.get(key)
        if d is not None:
            assert isinstance(d, self.zarr.core.Array)
            # https://stackoverflow.com/a/25656175
            d.resize(d.shape[0] + 1, *d.shape[1:])
            d[-1, ...] = obj
            if self.datastore_type == DatastoreType.LMDB:
                self.store.flush()
        else:
            self.f.create_dataset(
                key,
                data=obj[None, ...],
                compressor=self.compressor,
                chunks=self._get_chunk_size(obj),
            )
    else:
        import numcodecs
        # Fall back to pickling arbitrary objects, one array per item
        # self.f.create_dataset("{}/{}".format(key, self.i), data=obj)
        z = self.f.array(
            "{}/{}".format(key, self.i),
            obj,
            dtype=object,
            object_codec=numcodecs.Pickle(),
        )
        self.i += 1
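# The grow-by-one idiom from the ndarray branch above, in isolation (the
# store path, shape, and row values are illustrative): the array is resized
# in place, then the new trailing slot is filled.
import numpy as np
import zarr

z = zarr.open("/tmp/demo_append.zarr", mode="w",
              shape=(0, 3), chunks=(64, 3), dtype="f8")
row = np.array([1.0, 2.0, 3.0])
z.resize(z.shape[0] + 1, *z.shape[1:])  # https://stackoverflow.com/a/25656175
z[-1, ...] = row
assert z.shape == (1, 3)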
def _setup_new_zarr_store(
        self, zdim, sim_shapes, root, chunksize=1000) -> None:
    # Adding observational shapes to store

    # Parameters
    root.zeros(
        self._filesystem.pars,
        shape=(0, zdim),
        chunks=(chunksize, zdim),
        dtype="f8",
    )

    # Simulations
    sims = root.create_group(self._filesystem.sims)
    for name, shape in sim_shapes.items():
        sims.zeros(name, shape=(0, *shape), chunks=(chunksize, *shape), dtype="f8")

    # Random intensity weights
    root.zeros(
        self._filesystem.log_w,
        shape=(0,),
        chunks=(chunksize,),
        dtype="f8",
    )

    # Pickled Intensity (prior * N) objects
    root.create(
        self._filesystem.log_lambdas,
        shape=(0,),
        dtype=object,
        object_codec=numcodecs.Pickle(),
    )

    # Simulation status code
    root.zeros(
        self._filesystem.simulation_status,
        shape=(0,),
        chunks=(chunksize,),
        dtype="int",
    )
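# Sketch of how the zero-length pickled array created above can grow later
# (the group path and the stored dict are illustrative): Array.append
# extends along axis 0, and the Pickle codec serializes each element.
import numcodecs
import numpy as np
import zarr

root = zarr.open_group("/tmp/demo_intensity.zarr", mode="w")
log_lambdas = root.create(
    "log_lambdas", shape=(0,), dtype=object,
    object_codec=numcodecs.Pickle(),
)
payload = np.empty(1, dtype=object)
payload[0] = {"prior": "uniform", "N": 1000}
log_lambdas.append(payload)
assert log_lambdas[0]["N"] == 1000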
def __init__(
    self,
    fs_map: str,
    mode: str = "r",
    shape=None,
    max_shape=None,
    dtype="float64",
    chunks=None,
    compressor=DEFAULT_COMPRESSOR,
):
    """Constructor

    Parameters
    ----------
    fs_map : MutableMap
        Maps the filesystem to a MutableMap
    mode : str
        Mode in which the tensor is opened (default is "r"),
        can be used to overwrite or append
    shape : Tuple[int | None]
        Shape of the tensor (must be specified); may contain Nones,
        meaning the shape might change
    max_shape : Tuple[int | None]
        Maximum possible shape of the tensor (must be specified)
    dtype : str
        Numpy-analog dtype for this tensor
    chunks : Tuple[int] | True
        How to split the tensor into chunks (files) (default is True).
        If chunks=True, the chunk size is detected automatically.
    """
    if shape is not None:
        # otherwise the shape detector fails
        shapeDt = ShapeDetector(
            shape, max_shape, chunks, dtype, compressor=compressor
        )
        shape = shapeDt.shape
        max_shape = shapeDt.max_shape
        chunks = shapeDt.chunks
    elif "r" not in mode:
        raise TypeError("shape cannot be None")

    self.fs_map = fs_map
    exist_ = fs_map.get(".hub.dynamic_tensor")
    # if not exist_ and len(fs_map) > 0 and "w" in mode:
    #     raise OverwriteIsNotSafeException()
    exist = False if "w" in mode else exist_ is not None
    if "r" in mode and not exist:
        raise DynamicTensorNotFoundException()

    synchronizer = None
    # synchronizer = zarr.ThreadSynchronizer()
    # synchronizer = zarr.ProcessSynchronizer("~/activeloop/sync/example.sync")

    # If the tensor exists and the mode is read or append
    if ("r" in mode or "a" in mode) and exist:
        meta = json.loads(fs_map.get(".hub.dynamic_tensor").decode("utf-8"))
        shape = meta["shape"]
        self._dynamic_dims = get_dynamic_dims(shape)
        self._storage_tensor = zarr.open_array(
            store=fs_map, mode=mode, synchronizer=synchronizer
        )
        self._dynamic_tensor = (
            zarr.open_array(
                NestedStore(fs_map, "--dynamic--"),
                mode=mode,
                synchronizer=synchronizer,
            )
            if self._dynamic_dims
            else None
        )
    # Else we need to create or overwrite the tensor
    else:
        self._dynamic_dims = get_dynamic_dims(shape)
        self._storage_tensor = zarr.zeros(
            max_shape,
            dtype=dtype,
            chunks=chunks,
            store=fs_map,
            overwrite=("w" in mode),
            object_codec=numcodecs.Pickle(protocol=3)
            if str(dtype) == "object"
            else None,
            compressor=compressor,
            synchronizer=synchronizer,
        )
        self._dynamic_tensor = (
            zarr.zeros(
                shape=(max_shape[0], len(self._dynamic_dims)),
                mode=mode,
                dtype=np.int32,
                store=NestedStore(fs_map, "--dynamic--"),
                synchronizer=synchronizer,
                compressor=None,
            )
            if self._dynamic_dims
            else None
        )
        fs_map[".hub.dynamic_tensor"] = bytes(json.dumps({"shape": shape}), "utf-8")

    self.shape = shape
    self.max_shape = self._storage_tensor.shape
    self.chunks = self._storage_tensor.chunks
    self.dtype = self._storage_tensor.dtype

    if len(self.shape) != len(self.max_shape):
        raise DynamicTensorShapeException("length")
    for item in self.max_shape:
        if item is None:
            raise DynamicTensorShapeException("none")
    for item in zip(self.shape, self.max_shape):
        if item[0] is not None and item[0] != item[1]:
            raise DynamicTensorShapeException("not_equal")

    self._enabled_dynamicness = True
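# The constructor's codec decision, isolated (the in-memory store and the
# stored value are illustrative, not hub's actual storage layer): an object
# dtype switches on Pickle(protocol=3); any other dtype relies on the
# regular compressor.
import numcodecs
import zarr

dtype = "object"
z = zarr.zeros(
    (4, 5),
    dtype=dtype,
    chunks=(2, 5),
    store=zarr.MemoryStore(),
    overwrite=True,
    object_codec=numcodecs.Pickle(protocol=3) if str(dtype) == "object" else None,
)
z[0, 0] = {"ragged": [1, 2, 3]}  # each cell may hold any picklable object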
def writeFactorData(self, factor_data, table_name, ifactor_name,
                    if_exists="update", data_type=None):
    TablePath = self.MainDir + os.sep + table_name
    with self._DataLock:
        ZTable = zarr.open(TablePath, mode="a")
        if ifactor_name not in ZTable:
            factor_data, data_type = _identifyDataType(factor_data, data_type)
            ZFactor = ZTable.create_group(ifactor_name, overwrite=True)
            ZFactor.create_dataset(
                "ID",
                shape=(factor_data.shape[1],),
                data=factor_data.columns.values,
                dtype=object,
                object_codec=numcodecs.VLenUTF8(),
                overwrite=True,
            )
            ZFactor.create_dataset(
                "DateTime",
                shape=(factor_data.shape[0],),
                data=factor_data.index.values,
                dtype="M8[ns]",
                overwrite=True,
            )
            if data_type == "double":
                ZFactor.create_dataset(
                    "Data",
                    shape=factor_data.shape,
                    data=factor_data.values,
                    dtype="f8",
                    fill_value=np.nan,
                    overwrite=True,
                )
            elif data_type == "string":
                ZFactor.create_dataset(
                    "Data",
                    shape=factor_data.shape,
                    data=factor_data.values,
                    dtype=object,
                    object_codec=numcodecs.VLenUTF8(),
                    overwrite=True,
                )
            elif data_type == "object":
                ZFactor.create_dataset(
                    "Data",
                    shape=factor_data.shape,
                    data=factor_data.values,
                    dtype=object,
                    object_codec=numcodecs.Pickle(),
                    overwrite=True,
                )
            ZFactor.attrs["DataType"] = data_type
            DataType = ZTable.attrs.get("DataType", {})
            DataType[ifactor_name] = data_type
            ZTable.attrs["DataType"] = DataType
            return 0
    if if_exists == "update":
        self._updateFactorData(factor_data, table_name, ifactor_name, data_type)
    elif if_exists == "append":
        OldData = self.getTable(table_name).readFactorData(
            ifactor_name=ifactor_name,
            ids=factor_data.columns.tolist(),
            dts=factor_data.index.tolist(),
        )
        OldData.index = factor_data.index
        factor_data = OldData.where(pd.notnull(OldData), factor_data)
        self._updateFactorData(factor_data, table_name, ifactor_name, data_type)
    return 0
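# Illustrative contrast between the two object codecs used above (the group
# path and values are made up): VLenUTF8 handles str elements only, while
# Pickle accepts arbitrary Python objects at the cost of portability.
import numcodecs
import zarr

g = zarr.open_group("/tmp/demo_factors.zarr", mode="w")
ids = g.create_dataset("ID", shape=(2,), dtype=object,
                       object_codec=numcodecs.VLenUTF8(), overwrite=True)
ids[:] = ["000001.SZ", "600000.SH"]
data = g.create_dataset("Data", shape=(2,), dtype=object,
                        object_codec=numcodecs.Pickle(), overwrite=True)
data[0] = {"pe": 12.3, "tags": ["bank"]}  # not representable as VLenUTF8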
def run(self):
    """Main loop of the data collection worker process.

    Will receive brokered requests from the frontend, process them, and
    respond with the result through the broker.
    """
    context = zmq.Context()
    socket = context.socket(zmq.REP)
    socket.setsockopt(zmq.RCVTIMEO, default_poll_delay_ms)
    socket.connect(self.backend_address)
    self.time_init.value = time.time()
    self.time_counter.value = 0.0
    self.packet_counter.value = 0
    self.running_flag.value = 1
    print(
        f"creating zarr collection file at: {self.data_output_path}, "
        f"but ignoring compression flag {self.compression}",
        flush=True,
    )
    fd = zarr.open(self.data_output_path, "w")
    try:
        fd.attrs["git_hash"] = covid19sim.utils.utils.get_git_revision_hash()
    except subprocess.CalledProcessError:
        fd.attrs["git_hash"] = "NO_GIT"
    fd.attrs["creation_date"] = datetime.datetime.now().isoformat()
    fd.attrs["creator"] = str(platform.node())
    config_backup = (
        json.dumps(covid19sim.utils.utils.dumps_conf(self.config_backup))
        if self.config_backup
        else None
    )
    fd.attrs["config"] = config_backup
    dataset = fd.create_dataset(
        "dataset",
        shape=(self.simulation_days, 24, self.human_count),
        chunks=(1, None, None),  # one day per chunk: 1 x 24 x human_count
        dtype=object,
        object_codec=numcodecs.Pickle(),
    )
    is_filled = fd.create_dataset(
        "is_filled",
        shape=(self.simulation_days, 24, self.human_count),
        dtype=bool,
        fillvalue=False,
    )
    total_dataset_bytes = 0
    sample_idx = 0
    current_day = 0
    dataset_cache_factory = lambda: np.zeros(
        shape=(24, self.human_count), dtype=object)
    is_filled_cache_factory = lambda: np.zeros(
        shape=(24, self.human_count), dtype=bool)
    dataset_cache = dataset_cache_factory()
    is_filled_cache = is_filled_cache_factory()
    while not self.stop_flag.is_set():
        if self.reset_flag.is_set():
            self.time_counter.value = 0.0
            self.packet_counter.value = 0
            self.time_init.value = 0.0
            self.reset_flag.clear()
        try:
            buffer = socket.recv()
        except zmq.error.Again:
            continue
        proc_start_time = time.time()
        day_idx, hour_idx, human_idx, buffer = pickle.loads(buffer)
        total_dataset_bytes += len(buffer)
        if day_idx == (current_day + 1):
            # It's a new day: dump the cache...
            dataset[current_day, :, :] = dataset_cache
            is_filled[current_day, :, :] = is_filled_cache
            # ...make a new cache...
            dataset_cache = dataset_cache_factory()
            is_filled_cache = is_filled_cache_factory()
            # ...and update the current_day counter
            current_day += 1
        elif day_idx == current_day:
            pass
        else:
            raise RuntimeError(
                f"The worker was at day {current_day}, but got a "
                f"message from day {day_idx}. Bonk!")
        # Write the pickle and the is_filled flag to the cache
        dataset_cache[hour_idx, human_idx] = pickle.loads(buffer)
        is_filled_cache[hour_idx, human_idx] = True
        # Note to future self: this is what it used to be:
        # dataset[day_idx, hour_idx, human_idx] = pickle.loads(buffer)
        # is_filled[day_idx, hour_idx, human_idx] = True
        socket.send(str(sample_idx).encode())
        sample_idx += 1
        with self.time_counter.get_lock():
            self.time_counter.value += time.time() - proc_start_time
        with self.packet_counter.get_lock():
            self.packet_counter.value += 1
    self.running_flag.value = 0
    socket.close()
    dataset.attrs["total_samples"] = sample_idx
    dataset.attrs["total_bytes"] = total_dataset_bytes
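# Reading the worker's output back (a sketch; "output.zarr" stands in for
# self.data_output_path): each filled cell is unpickled transparently by
# the codec on read.
import zarr

fd = zarr.open("output.zarr", mode="r")
dataset, is_filled = fd["dataset"], fd["is_filled"]
day, hour, human = 0, 0, 0
if is_filled[day, hour, human]:
    sample = dataset[day, hour, human]  # an arbitrary unpickled object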