Example #1
    def _open_cache(self, location):
        if self.overwrite:
            self.cache = zarr.open(location, mode='w', shape=(self.cache_size,),
                                   chunks=(1,), dtype=object,
                                   object_codec=numcodecs.Pickle(),
                                   synchronizer=zarr.ThreadSynchronizer())
        elif os.path.exists(location):
            self.cache = zarr.open(location, mode='r',
                                   object_codec=numcodecs.Pickle(),
                                   synchronizer=zarr.ThreadSynchronizer())
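
A minimal, self-contained sketch of the same open-or-create pattern (zarr v2 API assumed; the path and size below are illustrative): an object-dtype array with numcodecs.Pickle() holds one arbitrary picklable Python object per cell.

import zarr
import numcodecs

# Create (or overwrite) an on-disk cache of pickled objects, one per slot.
cache = zarr.open("cache.zarr", mode="w", shape=(100,),
                  chunks=(1,), dtype=object,
                  object_codec=numcodecs.Pickle(),
                  synchronizer=zarr.ThreadSynchronizer())
cache[0] = {"status": "ok", "payload": [1, 2, 3]}   # any picklable object

# Re-opening read-only picks the dtype and codec up from the stored metadata.
readback = zarr.open("cache.zarr", mode="r")
print(readback[0]["payload"])
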
Example #2
    def __init__(
                self, max_strokes, max_stroke_length,
                batch_size, max_per_class=1000,
                root_dir=os.environ["QUICKDRAW_DATA_ROOT"],
                arr_dir="processed_data", transform=None):
        self.arr_dir = arr_dir
        self.root_dir = root_dir
        self.max_strokes = max_strokes
        self.max_stroke_length = max_stroke_length
        self.max_per_class = max_per_class
        self.batch_size = batch_size
        self.transform = transform

        self.zarr_kwargs = dict(
            compressor=Blosc(),
            chunks=(512,),
            dtype=object,
            object_codec=numcodecs.Pickle()
            )

        if not os.path.exists(self.get_arr_dir()):
            self.preprocess(root_dir)
       
        self.drawings, self.classes = (
            zarr.open(self.get_arr_path("drawings"),"r"),
            zarr.open(self.get_arr_path("classes"),"r")[:]
            )

        with open(self.get_json_path()) as f:
            d = json.load(f)
            self.class2label, self.country2label = d["class2label"], d["country2label"]
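
A minimal sketch of the zarr_kwargs used above (zarr v2 API assumed; the path and the ragged stroke data are illustrative): Blosc compresses the pickled bytes, while Pickle lets each cell hold a variable-length Python list.

import numpy as np
import zarr
import numcodecs
from numcodecs import Blosc

zarr_kwargs = dict(
    compressor=Blosc(),
    chunks=(512,),
    dtype=object,
    object_codec=numcodecs.Pickle(),
)

strokes = [[(0, 0), (3, 4)], [(1, 1), (2, 2), (5, 8)]]   # ragged per-drawing data
data = np.empty(len(strokes), dtype=object)
for i, s in enumerate(strokes):
    data[i] = s                                          # assign element-wise to keep it ragged

drawings = zarr.open("drawings.zarr", mode="w", shape=(len(strokes),), **zarr_kwargs)
drawings[:] = data

# Re-opening in read mode, as __init__ does, recovers the original lists.
print(zarr.open("drawings.zarr", mode="r")[1])
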
Example #3
def create_array(ds_group,
                 column,
                 column_schema,
                 schema_chunks,
                 coordinate=False):

    codec = numcodecs.Pickle() if column_schema.dtype == object else None

    if column_schema.chunks is None:
        try:
            # No column chunking found, probably an ndarray,
            # derive column chunking from chunks on dataset
            chunks = tuple(schema_chunks[d] for d in column_schema.dims)
        except KeyError:
            # Nope, just set chunks equal to dimension size
            chunks = tuple((s, ) for s in column_schema.shape)
    else:
        chunks = column_schema.chunks

    zchunks = zarr_chunks(column, column_schema.dims, chunks)

    array = ds_group.require_dataset(column,
                                     column_schema.shape,
                                     chunks=zchunks,
                                     dtype=column_schema.dtype,
                                     object_codec=codec,
                                     exact=True)

    array.attrs[DASKMS_ATTR_KEY] = {
        "dims": column_schema.dims,
        "coordinate": coordinate,
        "array_type": encode_type(column_schema.type),
    }
Example #4
def create_array(ds_group, column, schema, coordinate=False):
    # np.object was removed in NumPy 1.24; the builtin object is equivalent here
    codec = numcodecs.Pickle() if schema.dtype == object else None

    zchunks = zarr_chunks(column, schema.dims, schema.chunks)

    array = ds_group.require_dataset(column,
                                     schema.shape,
                                     chunks=zchunks,
                                     dtype=schema.dtype,
                                     object_codec=codec,
                                     exact=True)

    if zchunks is not None:
        # Expand zarr chunks to full dask resolution
        # For comparison purposes
        zchunks = normalize_chunks(array.chunks, schema.shape)

        if zchunks != schema.chunks:
            raise ValueError(
                f"zarr chunks {zchunks} "
                f"don't match dask chunks {schema.chunks}. "
                f"This can cause data corruption as described in "
                f"https://zarr.readthedocs.io/en/stable/tutorial.html"
                f"#parallel-computing-and-synchronization")

    array.attrs[DASKMS_ATTR_KEY] = {
        "dims": schema.dims,
        "coordinate": coordinate,
        "array_type": encode_type(schema.type),
    }
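
Examples #3 and #4 share the same core pattern: pick numcodecs.Pickle() only for object columns, then call require_dataset with exact=True. A minimal sketch of that pattern (zarr v2 API assumed; the group, column name, shape and chunks are illustrative):

import zarr
import numcodecs

ds_group = zarr.group()            # in-memory group, standing in for the dataset group
dtype = object                     # standing in for column_schema.dtype

codec = numcodecs.Pickle() if dtype == object else None
array = ds_group.require_dataset("EXAMPLE_COLUMN",
                                 shape=(10,),
                                 chunks=(5,),
                                 dtype=dtype,
                                 object_codec=codec,
                                 exact=True)

# require_dataset is idempotent: a second call with a matching shape and
# dtype returns the existing array instead of recreating it.
ds_group.require_dataset("EXAMPLE_COLUMN", shape=(10,), chunks=(5,),
                         dtype=dtype, object_codec=codec, exact=True)

array.attrs["dims"] = ("row",)     # attrs hold arbitrary JSON-serialisable metadata
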
Example #5
    def append(self, key, obj):
        if isinstance(obj, (np.ndarray, float, int, np.generic)):
            if isinstance(obj, (float, int, np.generic)):
                obj = np.array(obj)
            d = self.f.get(key)
            if d is not None:
                assert isinstance(d, self.zarr.core.Array)
                # https://stackoverflow.com/a/25656175
                d.resize(d.shape[0] + 1, *d.shape[1:])
                d[-1, ...] = obj
                if self.datastore_type == DatastoreType.LMDB:
                    self.store.flush()
            else:
                self.f.create_dataset(key,
                                      data=obj[None, ...],
                                      compressor=self.compressor,
                                      chunks=self._get_chunk_size(obj))
        else:
            import numcodecs

            # self.f.create_dataset("{}/{}".format(key, self.i), data=obj)
            z = self.f.array("{}/{}".format(key, self.i),
                             obj,
                             dtype=object,
                             object_codec=numcodecs.Pickle())
            self.i += 1
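
A minimal sketch of the resize-and-append branch above (zarr v2 API assumed; names are illustrative): the first call creates a dataset with a leading sample axis, and later calls grow that axis by one and write into the freed slot.

import numpy as np
import zarr

root = zarr.group()                 # in-memory group, standing in for self.f
obs = np.random.rand(3, 4)

d = root.get("obs")
if d is None:
    # First append: create the dataset with a leading sample axis of length 1.
    d = root.create_dataset("obs", data=obs[None, ...], chunks=(16, 3, 4))
else:
    # Later appends: grow the first axis and fill the new row.
    d.resize(d.shape[0] + 1, *d.shape[1:])
    d[-1, ...] = obs

print(d.shape)                      # (1, 3, 4) after the first call
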
Example #6
    def _setup_new_zarr_store(
            self,
            zdim,
            sim_shapes,
            root,
            chunksize=1000) -> None:  # Adding observational shapes to store
        # Parameters
        root.zeros(
            self._filesystem.pars,
            shape=(0, zdim),
            chunks=(chunksize, zdim),
            dtype="f8",
        )

        # Simulations
        sims = root.create_group(self._filesystem.sims)
        for name, shape in sim_shapes.items():
            sims.zeros(name,
                       shape=(0, *shape),
                       chunks=(chunksize, *shape),
                       dtype="f8")

        # Random intensity weights
        root.zeros(
            self._filesystem.log_w,
            shape=(0, ),
            chunks=(chunksize, ),
            dtype="f8",
        )

        # Pickled Intensity (prior * N) objects
        root.create(
            self._filesystem.log_lambdas,
            shape=(0, ),
            dtype=object,
            object_codec=numcodecs.Pickle(),
        )

        # Simulation status code
        root.zeros(
            self._filesystem.simulation_status,
            shape=(0, ),
            chunks=(chunksize, ),
            dtype="int",
        )
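
A minimal sketch of the empty-then-append store layout above (zarr v2 API assumed; group names, shapes and payloads are illustrative): arrays created with a zero-length first axis can later be grown with append(), including the pickled object column.

import numpy as np
import zarr
import numcodecs

root = zarr.group()                                    # in-memory for the sketch
root.zeros("pars", shape=(0, 3), chunks=(1000, 3), dtype="f8")
root.create("log_lambdas", shape=(0,), dtype=object,
            object_codec=numcodecs.Pickle())

# Grow the parameter array by five rows.
root["pars"].append(np.random.rand(5, 3))

# Append two arbitrary Python objects to the pickled column.
payload = np.empty(2, dtype=object)
payload[0] = {"prior": "uniform"}
payload[1] = {"prior": "normal"}
root["log_lambdas"].append(payload)

print(root["pars"].shape, root["log_lambdas"].shape)   # (5, 3) (2,)
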
Example #7
    def __init__(
        self,
        fs_map: str,
        mode: str = "r",
        shape=None,
        max_shape=None,
        dtype="float64",
        chunks=None,
        compressor=DEFAULT_COMPRESSOR,
    ):
        """Constructor
        Parameters
        ----------
        fs_map : MutableMapping
            Maps the filesystem to a MutableMapping
        mode : str
            Mode in which the tensor is opened (default is "r"); can be used to overwrite or append
        shape : Tuple[int | None]
            Shape of the tensor (must be specified); it can contain Nones, meaning the shape might change
        max_shape : Tuple[int | None]
            Maximum possible shape of the tensor (must be specified)
        dtype : str
            NumPy-compatible dtype for this tensor
        chunks : Tuple[int] | True
            How to split the tensor into chunks (files) (default is True)
            If chunks=True, the chunk size is detected automatically

        """
        if shape is not None:
            # otherwise shape detector fails
            shapeDt = ShapeDetector(
                shape, max_shape, chunks, dtype, compressor=compressor
            )
            shape = shapeDt.shape
            max_shape = shapeDt.max_shape
            chunks = shapeDt.chunks
        elif "r" not in mode:
            raise TypeError("shape cannot be none")

        self.fs_map = fs_map
        exist_ = fs_map.get(".hub.dynamic_tensor")

        # if not exist_ and len(fs_map) > 0 and "w" in mode:
        #     raise OverwriteIsNotSafeException()
        exist = False if "w" in mode else exist_ is not None
        if "r" in mode and not exist:
            raise DynamicTensorNotFoundException()

        synchronizer = None
        # synchronizer = zarr.ThreadSynchronizer()
        # synchronizer = zarr.ProcessSynchronizer("~/activeloop/sync/example.sync")
        # if tensor exists and mode is read or append

        if ("r" in mode or "a" in mode) and exist:
            meta = json.loads(fs_map.get(".hub.dynamic_tensor").decode("utf-8"))
            shape = meta["shape"]
            self._dynamic_dims = get_dynamic_dims(shape)
            self._storage_tensor = zarr.open_array(
                store=fs_map, mode=mode, synchronizer=synchronizer
            )
            self._dynamic_tensor = (
                zarr.open_array(
                    NestedStore(fs_map, "--dynamic--"),
                    mode=mode,
                    synchronizer=synchronizer,
                )
                if self._dynamic_dims
                else None
            )
        # else we need to create or overwrite the tensor
        else:
            self._dynamic_dims = get_dynamic_dims(shape)
            self._storage_tensor = zarr.zeros(
                max_shape,
                dtype=dtype,
                chunks=chunks,
                store=fs_map,
                overwrite=("w" in mode),
                object_codec=numcodecs.Pickle(protocol=3)
                if str(dtype) == "object"
                else None,
                compressor=compressor,
                synchronizer=synchronizer,
            )
            self._dynamic_tensor = (
                zarr.zeros(
                    shape=(max_shape[0], len(self._dynamic_dims)),
                    mode=mode,
                    dtype=np.int32,
                    store=NestedStore(fs_map, "--dynamic--"),
                    synchronizer=synchronizer,
                    compressor=None,
                )
                if self._dynamic_dims
                else None
            )

            fs_map[".hub.dynamic_tensor"] = bytes(json.dumps({"shape": shape}), "utf-8")

        self.shape = shape
        self.max_shape = self._storage_tensor.shape
        self.chunks = self._storage_tensor.chunks
        self.dtype = self._storage_tensor.dtype

        if len(self.shape) != len(self.max_shape):
            raise DynamicTensorShapeException("length")
        for item in self.max_shape:
            if item is None:
                raise DynamicTensorShapeException("none")
        for item in zip(self.shape, self.max_shape):
            if item[0] is not None:
                if item[0] != item[1]:
                    raise DynamicTensorShapeException("not_equal")
        self._enabled_dynamicness = True
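
A minimal sketch of how the backing tensor above is created (zarr v2 API assumed; the store, shape and metadata key are illustrative): an object dtype only works when an object_codec such as numcodecs.Pickle is supplied, and sidecar metadata can live as plain bytes in the same store.

import json
import zarr
import numcodecs

store = zarr.storage.MemoryStore()            # standing in for fs_map
dtype = "object"

tensor = zarr.zeros(
    (8, 4),
    dtype=dtype,
    chunks=(4, 4),
    store=store,
    overwrite=True,
    object_codec=numcodecs.Pickle(protocol=3) if str(dtype) == "object" else None,
)

# Sidecar metadata kept directly in the same store, as in the example above.
store[".hub.dynamic_tensor"] = bytes(json.dumps({"shape": [8, 4]}), "utf-8")
meta = json.loads(store[".hub.dynamic_tensor"].decode("utf-8"))
print(meta["shape"], tensor.dtype)
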
Example #8
    def writeFactorData(self,
                        factor_data,
                        table_name,
                        ifactor_name,
                        if_exists="update",
                        data_type=None):
        TablePath = self.MainDir + os.sep + table_name
        with self._DataLock:
            ZTable = zarr.open(TablePath, mode="a")
            if ifactor_name not in ZTable:
                factor_data, data_type = _identifyDataType(
                    factor_data, data_type)
                ZFactor = ZTable.create_group(ifactor_name, overwrite=True)
                ZFactor.create_dataset("ID",
                                       shape=(factor_data.shape[1], ),
                                       data=factor_data.columns.values,
                                       dtype=object,
                                       object_codec=numcodecs.VLenUTF8(),
                                       overwrite=True)
                ZFactor.create_dataset("DateTime",
                                       shape=(factor_data.shape[0], ),
                                       data=factor_data.index.values,
                                       dtype="M8[ns]",
                                       overwrite=True)
                if data_type == "double":
                    ZFactor.create_dataset("Data",
                                           shape=factor_data.shape,
                                           data=factor_data.values,
                                           dtype="f8",
                                           fill_value=np.nan,
                                           overwrite=True)
                elif data_type == "string":
                    ZFactor.create_dataset("Data",
                                           shape=factor_data.shape,
                                           data=factor_data.values,
                                           dtype=object,
                                           object_codec=numcodecs.VLenUTF8(),
                                           overwrite=True)
                elif data_type == "object":
                    ZFactor.create_dataset("Data",
                                           shape=factor_data.shape,
                                           data=factor_data.values,
                                           dtype=object,
                                           object_codec=numcodecs.Pickle(),
                                           overwrite=True)
                ZFactor.attrs["DataType"] = data_type
                DataType = ZTable.attrs.get("DataType", {})
                DataType[ifactor_name] = data_type
                ZTable.attrs["DataType"] = DataType
                return 0
        if if_exists == "update":
            self._updateFactorData(factor_data, table_name, ifactor_name,
                                   data_type)
        elif if_exists == "append":
            OldData = self.getTable(table_name).readFactorData(
                ifactor_name=ifactor_name,
                ids=factor_data.columns.tolist(),
                dts=factor_data.index.tolist())
            OldData.index = factor_data.index
            factor_data = OldData.where(pd.notnull(OldData), factor_data)
            self._updateFactorData(factor_data, table_name, ifactor_name,
                                   data_type)
        return 0
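
A minimal sketch of the codec choice made above (zarr v2 API assumed; the group, factor name and values are illustrative): variable-length strings get numcodecs.VLenUTF8(), while mixed Python objects fall back to numcodecs.Pickle().

import numpy as np
import zarr
import numcodecs

ZTable = zarr.group()                        # in-memory, standing in for the table
ZFactor = ZTable.create_group("SomeFactor", overwrite=True)

ids = np.array(["000001.SZ", "600000.SH"], dtype=object)
ZFactor.create_dataset("ID", shape=ids.shape, data=ids, dtype=object,
                       object_codec=numcodecs.VLenUTF8(), overwrite=True)

mixed = np.empty(2, dtype=object)
mixed[0] = {"q": 1}                          # not a string, so VLenUTF8 won't do
mixed[1] = [1, 2, 3]
ZFactor.create_dataset("Data", shape=mixed.shape, data=mixed, dtype=object,
                       object_codec=numcodecs.Pickle(), overwrite=True)

ZFactor.attrs["DataType"] = "object"
print(ZFactor["ID"][:], ZFactor["Data"][1])
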
Example #9
    def run(self):
        """Main loop of the data collection worker process.

        Will receive brokered requests from the frontend, process them, and respond
        with the result through the broker.
        """
        context = zmq.Context()
        socket = context.socket(zmq.REP)
        socket.setsockopt(zmq.RCVTIMEO, default_poll_delay_ms)
        socket.connect(self.backend_address)
        self.time_init.value = time.time()
        self.time_counter.value = 0.0
        self.packet_counter.value = 0
        self.running_flag.value = 1
        print(
            f"creating zarr collection file at: {self.data_output_path}, "
            f"but ignoring compression flag {self.compression}",
            flush=True)
        fd = zarr.open(self.data_output_path, "w")
        try:
            fd.attrs[
                "git_hash"] = covid19sim.utils.utils.get_git_revision_hash()
        except subprocess.CalledProcessError:
            fd.attrs["git_hash"] = "NO_GIT"
        fd.attrs["creation_date"] = datetime.datetime.now().isoformat()
        fd.attrs["creator"] = str(platform.node())
        config_backup = json.dumps(covid19sim.utils.utils.dumps_conf(self.config_backup)) \
            if self.config_backup else None
        fd.attrs["config"] = config_backup
        dataset = fd.create_dataset(
            "dataset",
            shape=(
                self.simulation_days,
                24,
                self.human_count,
            ),
            chunks=(1, None, None),  # each chunk covers 1 x 24 x human_count
            dtype=object,
            object_codec=numcodecs.Pickle(),
        )
        is_filled = fd.create_dataset("is_filled",
                                      shape=(
                                          self.simulation_days,
                                          24,
                                          self.human_count,
                                      ),
                                      dtype=bool,
                                      fillvalue=False)
        total_dataset_bytes = 0
        sample_idx = 0
        current_day = 0
        dataset_cache_factory = lambda: np.zeros(shape=(24, self.human_count),
                                                 dtype=object)
        is_filled_cache_factory = lambda: np.zeros(
            shape=(24, self.human_count), dtype=bool)
        dataset_cache = dataset_cache_factory()
        is_filled_cache = is_filled_cache_factory()
        while not self.stop_flag.is_set():
            if self.reset_flag.is_set():
                self.time_counter.value = 0.0
                self.packet_counter.value = 0
                self.time_init.value = 0.0
                self.reset_flag.clear()
            try:
                buffer = socket.recv()
            except zmq.error.Again:
                continue
            proc_start_time = time.time()
            day_idx, hour_idx, human_idx, buffer = pickle.loads(buffer)
            total_dataset_bytes += len(buffer)
            if day_idx == (current_day + 1):
                # It's a new day
                # Dump the cache
                dataset[current_day, :, :] = dataset_cache
                is_filled[current_day, :, :] = is_filled_cache
                # Make a new cache
                dataset_cache = dataset_cache_factory()
                is_filled_cache = is_filled_cache_factory()
                # Update the current_day counter
                current_day += 1
            elif day_idx == current_day:
                pass
            else:
                raise RuntimeError(
                    f"The worker was at day {current_day}, but got a "
                    f"message from day {day_idx}. Bonk!")

            # Write the pickle and is_filled to cache
            dataset_cache[hour_idx, human_idx] = pickle.loads(buffer)
            is_filled_cache[hour_idx, human_idx] = True
            # Note to future self: this is what it used to be:
            #    dataset[day_idx, hour_idx, human_idx] = pickle.loads(buffer)
            #    is_filled[day_idx, hour_idx, human_idx] = True
            socket.send(str(sample_idx).encode())
            sample_idx += 1
            with self.time_counter.get_lock():
                self.time_counter.value += time.time() - proc_start_time
            with self.packet_counter.get_lock():
                self.packet_counter.value += 1
        self.running_flag.value = 0
        socket.close()
        dataset.attrs["total_samples"] = sample_idx
        dataset.attrs["total_bytes"] = total_dataset_bytes