def save_zarr(id_patient, lung_mask, nodule_mask):
    lung_mask_group.array(id_patient, lung_mask,
                          chunks=(10, 1, 512, 512),
                          compressor=zarr.Blosc(clevel=9, cname="zstd", shuffle=2),
                          synchronizer=zarr.ThreadSynchronizer())
    nodule_mask_group.array(id_patient, nodule_mask,
                            chunks=(10, 1, 512, 512),
                            compressor=zarr.Blosc(clevel=9, cname="zstd", shuffle=2),
                            synchronizer=zarr.ThreadSynchronizer())
    return
def save_zarr(id_patient, lung_mask, cand):
    lung_mask_group.array(id_patient, lung_mask,
                          chunks=(1, 17, 21, 21),
                          compressor=zarr.Blosc(clevel=9, cname="zstd", shuffle=2),
                          synchronizer=zarr.ThreadSynchronizer())
    cand_group.array(id_patient, cand,
                     chunks=(1, 17, 21, 21),
                     compressor=zarr.Blosc(clevel=9, cname="zstd", shuffle=2),
                     synchronizer=zarr.ThreadSynchronizer())
    return
def reset_states(self, input_shape=None):
    """Initialize the state space.

    This method initializes the layer and resets any previously held data.
    The zarr array is initialized in this method.

    Args:
        input_shape (TensorShape, tuple, list): Shape of the input.
    """
    if input_shape is not None:
        self._input_shape = input_shape

    if self._input_shape is None:
        raise ValueError(
            'The input_shape is None, and no previous input '
            'shape information was provided. The first time '
            'reset_states is called, an input_shape must be '
            'provided.')

    # Try to keep chunks limited to 16MB
    ncols = int(np.ceil(self._input_shape[self._channel_index] / 8))
    nrows = 2**22 // ncols

    # Initialize internal variables related to state space
    self._state_ids = None
    self._edges = None
    self._index = None
    self._counts = None
    self._entropy = None
    self._threads = []
    self._chunk_size = (nrows, ncols)
    self._state_shape = list(self._chunk_size)
    self._state_count = 0

    if self._raw_states is not None:
        # Zero out states and resize if zarr already open
        self._raw_states.resize(self._state_shape)
        self._raw_states[:] = 0
    else:
        # Initialize the zarr array
        if self._zarr_path is not None:
            if self._zarr_path.is_file():
                self._zarr_path.unlink()
            self._raw_states = zarr.zeros(
                shape=self._state_shape,
                chunks=self._chunk_size,
                dtype='B',
                synchronizer=zarr.ThreadSynchronizer(),
                store=str(self._zarr_path.absolute()))
        else:
            self._raw_states = zarr.zeros(
                shape=self._state_shape,
                chunks=self._chunk_size,
                dtype='B',
                synchronizer=zarr.ThreadSynchronizer())
def _open_cache(self, location):
    if self.overwrite:
        self.cache = zarr.open(location, mode='w',
                               shape=(self.cache_size,), chunks=(1,),
                               dtype=object, object_codec=numcodecs.Pickle(),
                               synchronizer=zarr.ThreadSynchronizer())
    elif os.path.exists(location):
        self.cache = zarr.open(location, mode='r',
                               object_codec=numcodecs.Pickle(),
                               synchronizer=zarr.ThreadSynchronizer())
def __append_vars(ds, store, dim, mode='serial'):
    print("Append vars")
    dataset = __nc_open(ds)
    store[dim].append(dataset[dim])
    if mode == 'serial':
        for name in dataset.variables.keys():
            __append_var(ds, store, name, dim)
    elif mode == 'processes':
        with ProcessPoolExecutor(max_workers=8) as executor:
            syncro = zarr.ProcessSynchronizer(SHARED + 'ntz.sync')
            for name in dataset.variables.keys():
                executor.submit(__append_var, ds, store, name, dim, syncro)
    elif mode == 'threads':
        with ThreadPoolExecutor(max_workers=8) as executor:
            syncro = zarr.ThreadSynchronizer()
            for name in dataset.variables.keys():
                executor.submit(__append_var, ds, store, name, dim, syncro)
    else:
        raise ValueError('the mode %s is not valid.' % mode)
def test_info():
    # setup
    g = zarr.group(store=dict(), chunk_store=dict(),
                   synchronizer=zarr.ThreadSynchronizer())
    g.create_group('foo')
    z = g.zeros('bar', shape=10, filters=[numcodecs.Adler32()])

    # test group info
    items = g.info_items()
    keys = sorted([k for k, _ in items])
    expected_keys = sorted([
        'Type', 'Read-only', 'Synchronizer type', 'Store type',
        'Chunk store type', 'No. members', 'No. arrays', 'No. groups',
        'Arrays', 'Groups', 'Name'
    ])
    assert expected_keys == keys

    # test array info
    items = z.info_items()
    keys = sorted([k for k, _ in items])
    expected_keys = sorted([
        'Type', 'Data type', 'Shape', 'Chunk shape', 'Order', 'Read-only',
        'Filter [0]', 'Compressor', 'Synchronizer type', 'Store type',
        'Chunk store type', 'No. bytes', 'No. bytes stored', 'Storage ratio',
        'Chunks initialized', 'Name'
    ])
    assert expected_keys == keys
def test_info(array_size):
    # setup
    g = zarr.group(store=dict(), chunk_store=dict(),
                   synchronizer=zarr.ThreadSynchronizer())
    g.create_group('foo')
    z = g.zeros('bar', shape=array_size, filters=[numcodecs.Adler32()])

    # test group info
    items = g.info_items()
    keys = sorted([k for k, _ in items])
    expected_keys = sorted([
        'Type', 'Read-only', 'Synchronizer type', 'Store type',
        'Chunk store type', 'No. members', 'No. arrays', 'No. groups',
        'Arrays', 'Groups', 'Name'
    ])
    assert expected_keys == keys

    # can also get a string representation of info via the info attribute
    assert isinstance(g.info, InfoReporter)
    assert "Type" in repr(g.info)

    # test array info
    items = z.info_items()
    keys = sorted([k for k, _ in items])
    expected_keys = sorted([
        'Type', 'Data type', 'Shape', 'Chunk shape', 'Order', 'Read-only',
        'Filter [0]', 'Compressor', 'Synchronizer type', 'Store type',
        'Chunk store type', 'No. bytes', 'No. bytes stored', 'Storage ratio',
        'Chunks initialized', 'Name'
    ])
    assert expected_keys == keys

    # can also get a string representation of info via the info attribute
    assert isinstance(z.info, InfoReporter)
    assert "Type" in repr(z.info)
def test_run_simulator_with_threads_and_zarr_directory_store():
    """
    If the store is on disk (here a Zarr DirectoryStore), collect_in_memory
    can be set to False (but synchronization needs to be employed).
    """
    cluster = LocalCluster(n_workers=2, processes=False, threads_per_worker=1)
    simulator = Simulator(model, sim_shapes=dict(x=(10,)), cluster=cluster)

    with tempfile.TemporaryDirectory() as tmpdir:
        pars = zarr.open(f"{tmpdir}/pars.zarr", shape=(100, 2))
        pars[:, :] = np.random.random(pars.shape)

        x = zarr.open(f"{tmpdir}/x.zarr", shape=(100, 10),
                      synchronizer=zarr.ThreadSynchronizer())
        x[:, :] = 0.0
        sims = dict(x=x.oindex)

        sim_status = zarr.open(
            f"{tmpdir}/sim_status.zarr",
            shape=(100,),
            synchronizer=zarr.ThreadSynchronizer(),
        )
        sim_status[:] = np.full(100, SimulationStatus.RUNNING, dtype="int")

        # the following is non-blocking (it immediately returns)
        simulator.run(
            pars=pars,
            sims=sims,
            sim_status=sim_status.oindex,
            indices=np.arange(100, dtype=int),
            collect_in_memory=False,
            batch_size=20,
        )

        # need to wait for tasks to be completed
        _wait_for_all_tasks()

        assert np.all(sim_status[:] == SimulationStatus.FINISHED)
        assert not np.all(np.isclose(sims["x"][:, :].sum(axis=1), 0.0))

    simulator.client.close()
    cluster.close()
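# A minimal sketch (not part of the test above) of the pattern its docstring
# describes: when results live in an on-disk zarr array and collect_in_memory
# is False, the array is opened with a ThreadSynchronizer so several threads
# can write their slices concurrently. The path, shape, and worker count here
# are illustrative assumptions.
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import zarr

x = zarr.open("results_x.zarr", mode="a", shape=(100, 10), chunks=(20, 10),
              dtype="f8", synchronizer=zarr.ThreadSynchronizer())

def fill_slice(start):
    # each worker fills its own batch of rows
    x[start:start + 20, :] = np.random.random((20, 10))

with ThreadPoolExecutor(max_workers=4) as pool:
    list(pool.map(fill_slice, range(0, 100, 20)))

assert not np.all(np.isclose(x[:, :].sum(axis=1), 0.0))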
def compress_zarr_dataset(data, file_path, compression='lz4', clevel=5, start_idx=0, end_idx=0):
    """
    Loads a zarr dataset and exports it with a given compression type and level.

    :param data: Zarr dataset which will be compressed
    :param file_path: File path where the data will be exported (e.g. "./export/data.zip")
    :param compression: Compression type
    :param clevel: Compression level
    :param start_idx: Starting index of the data to be exported
    :param end_idx: If end_idx != 0, the dataset will be exported up to the specified index,
                    excluding the sample at end_idx (e.g. end_idx = len(x) exports it fully)
    :return: True if a NaN value was detected
    """
    compressor = Blosc(cname=compression, clevel=clevel, shuffle=Blosc.SHUFFLE)

    # open a dataset file and create arrays
    store = zarr.ZipStore(file_path, mode="w")
    zarr_file = zarr.group(store=store, overwrite=True)
    nan_detected = False

    for key in data.keys():
        if end_idx == 0:
            x = data[key]
        else:
            x = data[key][start_idx:end_idx]
        if np.isnan(x).any():
            nan_detected = True

        array_shape = list(x.shape)
        array_shape[0] = 128
        # export array
        zarr_file.create_dataset(
            name=key,
            data=x,
            shape=x.shape,
            dtype=type(x.flatten()[0]),
            chunks=array_shape,
            synchronizer=zarr.ThreadSynchronizer(),
            compression=compressor,
        )
    store.close()
    logging.info("dataset was exported to: %s", file_path)
    return nan_detected
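# Hypothetical usage of compress_zarr_dataset above; the input path is an
# assumption for illustration only, not taken from the original source.
import zarr

data = zarr.open("./raw/data.zip", mode="r")  # existing zarr group
nan_found = compress_zarr_dataset(data, "./export/data.zip",
                                  compression="lz4", clevel=5)
print("NaN detected:", nan_found)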
def __set_dims(ds, group, mode):
    dataset = __nc_open(ds)
    if mode == 'serial':
        for name in dataset.variables.keys():
            __set_dim(ds, group, name)
    elif mode == 'processes':
        with ProcessPoolExecutor(max_workers=8) as executor:
            syncro = zarr.ProcessSynchronizer(SHARED + 'ntz.sync')
            for name in dataset.variables.keys():
                executor.submit(__set_dim, ds, group, name, syncro)
    elif mode == 'threads':
        with ThreadPoolExecutor(max_workers=8) as executor:
            syncro = zarr.ThreadSynchronizer()
            for name in dataset.variables.keys():
                executor.submit(__set_dim, ds, group, name, syncro)
    else:
        raise ValueError('the mode %s is not valid.' % mode)
def _create_window(name, ntime, nchan, nbl, ncorr, dtype, default, token,
                   backend="numpy", path=None):
    if backend == "zarr-disk":
        return zarr.creation.create(shape=(nbl, ncorr, ntime, nchan),
                                    chunks=(1, ncorr, ntime, nchan),
                                    compressor=None,
                                    dtype=dtype,
                                    synchronizer=zarr.ThreadSynchronizer(),
                                    overwrite=True,
                                    fill_value=default,
                                    read_only=False,
                                    store=pjoin(path, "-".join((name, token))))
    elif backend == "numpy":
        return np.full((nbl, ncorr, ntime, nchan), default, dtype=dtype)
    else:
        raise ValueError("Invalid backend '%s'" % backend)
def convert_to_zarr(self, str_beg=None, str_end=None, out_filename=None, out_path=None):
    """Create a zarr file for data between datetime_beg and datetime_end."""
    if not str_end:
        end = self._datetime_end
    else:
        end = datetime.datetime.strptime(str_end, '%Y-%m-%d')
    if not str_beg:
        beg = self._datetime_beg
    else:
        beg = datetime.datetime.strptime(str_beg, '%Y-%m-%d')

    if not out_filename:
        store = self._compose_out_filename(str_beg, str_end) + '.zarr'
    else:
        store = out_filename
    if out_path:
        store = os.path.join(out_path, store)

    # create hierarchy
    root = zarr.open(store, mode='w')  # 'w' means create (overwrite if exists)

    # FIXME: Remove these root attributes
    root.attrs['DT'] = self.DT
    root.attrs['TZ'] = self.TZ

    raw = root.create_group('raw')

    # Zarr provides support for chunk-level synchronization.
    # This array is safe to read or write within a multi-threaded program.
    compressor = Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE)
    self.z_raw = raw.zeros('source',
                           shape=(len(self._sources), self._length_max),
                           chunks=(1, self._length_max),
                           dtype='i2',
                           compressor=compressor,
                           synchronizer=zarr.ThreadSynchronizer())
    self.z_raw[:] = np.nan

    i = 0
    axis_datetime = []
    pool = Pool(processes=6)
    for item in self._sources:
        if beg <= item['datetime'] <= end:
            axis_datetime.append(item['datetime'])
            cur_filename = item['filename']
            pool.apply_async(self.reader, args=(i, cur_filename,),
                             callback=self.log_result)
            i += 1
    pool.close()
    pool.join()
    # print(self.result_list)

    # append created axis
    # FIXME: set a datetime axis (for given TZ!!!)
    # 's' -> seconds precision!!!
    z_created = raw.zeros('created', shape=(len(self._sources),), dtype='M8[s]')
    z_created[:] = axis_datetime
    print(root.tree())
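# A self-contained sketch of the chunk-level synchronization mentioned in the
# comment above: with a ThreadSynchronizer attached, writers that touch the
# same chunk are serialized, so the array can be filled safely from multiple
# threads. Shapes and values are illustrative assumptions, not from the
# original source.
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import zarr

z = zarr.zeros((8, 1000), chunks=(1, 1000), dtype='i2',
               synchronizer=zarr.ThreadSynchronizer())

def write_row(i):
    # each call fills exactly one chunk (one row)
    z[i, :] = np.full(1000, i, dtype='i2')

with ThreadPoolExecutor(max_workers=4) as pool:
    list(pool.map(write_row, range(z.shape[0])))

assert int(z[3, 0]) == 3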
# In[2]:

wvel_data = np.random.normal(2000, 1000, size=[8000, 7500]).astype(np.float32)
human_readable_size(wvel_data.nbytes)

# ### Copy to a zarr file on disk, using multiple threads

# In[3]:

item = 'disk1_data'
store = zarr.DirectoryStore(item)
group = zarr.hierarchy.group(store=store, overwrite=True,
                             synchronizer=zarr.ThreadSynchronizer())
the_var = 'wvel'
out_zarr1 = group.zeros(the_var, shape=wvel_data.shape,
                        dtype=wvel_data.dtype, chunks=[2000, 7500])
out_zarr1[...] = wvel_data[...]

# ### Add some attributes

# In[4]:

now = datetime.datetime.now(pytz.UTC)
timestamp = int(now.strftime('%s'))
out_zarr1.attrs['history'] = 'written for practice'
out_zarr1.attrs['creation_date'] = str(now)
out_zarr1.attrs['gmt_timestamp'] = timestamp
# %% [markdown]
# ### Create 230 Mbytes of fake data

# %%
wvel_data = np.random.normal(2000, 1000, size=[8000, 7500]).astype(np.float32)
human_readable_size(wvel_data.nbytes)

# %% [markdown]
# ### Copy to a zarr file on disk, using multiple threads

# %%
item = 'disk1_data'
store = zarr.DirectoryStore(item)
group = zarr.hierarchy.group(store=store, overwrite=True,
                             synchronizer=zarr.ThreadSynchronizer())
the_var = 'wvel'
out_zarr1 = group.zeros(the_var, shape=wvel_data.shape,
                        dtype=wvel_data.dtype, chunks=[2000, 7500])
out_zarr1[...] = wvel_data[...]

# %% [markdown]
# ### Add some attributes

# %%
now = datetime.datetime.now(pytz.UTC)
timestamp = int(now.strftime('%s'))
out_zarr1.attrs['history'] = 'written for practice'
out_zarr1.attrs['creation_date'] = str(now)
def export_pgn_batch(self, cur_part, game_idx_start, game_idx_end, pgn_sel,
                     nb_white_wins, nb_black_wins, nb_draws):
    """
    Exports one part of the pgn-files of the currently selected games. After the
    export of one part, the memory used by the local variables can be freed.
    If the function ran successfully, a new dataset part-file was created in the
    dataset export directory. Multiprocessing is used for loading and exporting.

    :param cur_part: Current part (integer value which starts at 0)
    :param game_idx_start: Starting game index of the selected games for this part
    :param game_idx_end: End game index of the current part
    :param pgn_sel: Selected PGN data which will be used for the export
    :param nb_white_wins: Number of games which white won in the current part
    :param nb_black_wins: Number of games which black won in the current part
    :param nb_draws: Number of draws in the current part
    :return: True
    """
    # create a param input list which pairs each pgn with its corresponding game index
    params_inp = []
    for i, pgn in enumerate(pgn_sel):
        game_idx = game_idx_start + i
        params_inp.append((pgn, game_idx, self._mate_in_one))

    logging.info("starting conversion to planes...")
    t_s = time()
    p = Pool()
    x_dic = {}
    y_value_dic = {}
    y_policy_dic = {}
    metadata_dic = {}

    if not os.path.exists(self._export_dir):
        os.makedirs(self._export_dir)
        logging.info("the dataset_export directory was created at: %s", self._export_dir)

    # create a directory for the current timestamp
    if not os.path.exists(self._timestmp_dir):
        os.makedirs(self._timestmp_dir)

    # http://machinelearninguru.com/deep_learning/data_preparation/hdf5/hdf5.html
    zarr_path = self._timestmp_dir + self._pgn_name.replace(".pgn", "_" + str(cur_part) + ".zip")

    # open a dataset file and create arrays
    store = zarr.ZipStore(zarr_path, mode="w")
    zarr_file = zarr.group(store=store, overwrite=True)

    # the games occur in random order due to multiprocessing;
    # in order to keep the structure we store the results in dictionaries first
    for metadata, game_idx, x, y_value, y_policy in p.map(get_planes_from_pgn, params_inp):
        metadata_dic[game_idx] = metadata
        x_dic[game_idx] = x
        y_value_dic[game_idx] = y_value
        y_policy_dic[game_idx] = y_policy
    p.close()
    p.join()

    t_e = time() - t_s
    logging.debug("elapsed time: %fs", t_e)
    t_mean = t_e / self._batch_size
    logging.debug("mean time for 1 game: %f ms", t_mean * 1000)
    # logging.debug('approx time for whole file (nb_games: %d): %fs', self._nb_games, t_mean * self._nb_games)

    # now we can convert the dictionaries to lists
    metadata = get_dic_sorted_by_key(metadata_dic)
    x = get_dic_sorted_by_key(x_dic)
    y_value = get_dic_sorted_by_key(y_value_dic)
    y_policy = get_dic_sorted_by_key(y_policy_dic)

    # create a list which describes where each game starts
    start_indices = np.zeros(len(x))
    for i, x_cur in enumerate(x[:-1]):
        start_indices[i + 1] = start_indices[i] + len(x_cur)

    # next we stack the lists into numpy arrays
    metadata = np.concatenate(metadata, axis=0)
    x = np.concatenate(x, axis=0)
    y_value = np.concatenate(y_value, axis=0)
    y_policy = np.concatenate(y_policy, axis=0)

    logging.debug("metadata.shape %s", metadata.shape)
    logging.debug("x.shape %s", x.shape)
    logging.debug("y_value.shape %s", y_value.shape)
    logging.debug("y_policy.shape %s", y_policy.shape)

    # Save the dataset to a file
    logging.info("saving the dataset to a file...")

    # define the compressor object
    compressor = Blosc(cname=self._compression, clevel=self._clevel, shuffle=Blosc.SHUFFLE)

    # export the metadata
    zarr_file.create_dataset(
        name="metadata",
        data=metadata,
        shape=metadata.shape,
        dtype=metadata.dtype,
        synchronizer=zarr.ThreadSynchronizer(),
        compression=compressor,
    )

    # export the images
    zarr_file.create_dataset(
        name="x",
        data=x,
        shape=x.shape,
        dtype=np.int16,
        chunks=(128, x.shape[1], x.shape[2], x.shape[3]),
        synchronizer=zarr.ThreadSynchronizer(),
        compression=compressor,
    )

    # create the label arrays and copy the label data into them
    zarr_file.create_dataset(
        name="y_value",
        shape=y_value.shape,
        dtype=np.int16,
        data=y_value,
        synchronizer=zarr.ThreadSynchronizer()
    )
    zarr_file.create_dataset(
        name="y_policy",
        shape=y_policy.shape,
        dtype=np.int16,
        data=y_policy,
        chunks=(128, y_policy.shape[1]),
        synchronizer=zarr.ThreadSynchronizer(),
        compression=compressor,
    )
    zarr_file.create_dataset(
        name="start_indices",
        shape=start_indices.shape,
        dtype=np.int32,
        data=start_indices,
        synchronizer=zarr.ThreadSynchronizer(),
        compression=compressor,
    )

    # export the parameter settings and statistics of the file
    zarr_file.create_group("/parameters")
    zarr_file.create_dataset(
        name="/parameters/pgn_name",
        shape=(1,),
        dtype="S" + str(len(self._pgn_name) + 1),
        data=[self._pgn_name.encode("ascii", "ignore")],
        compression=compressor,
    )
    zarr_file.create_dataset(
        name="/parameters/limit_nb_games",
        data=[self._limit_nb_games],
        shape=(1,),
        dtype=np.int16,
        compression=compressor,
    )
    zarr_file.create_dataset(
        name="/parameters/batch_size",
        shape=(1,),
        dtype=np.int16,
        data=[self._batch_size],
        compression=compressor
    )
    zarr_file.create_dataset(
        name="/parameters/max_nb_files",
        shape=(1,),
        dtype=np.int16,
        data=[self._max_nb_files],
        compression=compressor,
    )
    zarr_file.create_dataset(
        name="/parameters/min_elo_both",
        shape=(1,),
        dtype=np.int16,
        data=[self._min_elo_both],
        compression=compressor,
    )
    if self._compression is not None:
        zarr_file.create_dataset(
            "/parameters/compression",
            shape=(1,),
            dtype="S" + str(len(self._compression) + 1),
            data=[self._compression.encode("ascii", "ignore")],
            compression=compressor,
        )

    # https://stackoverflow.com/questions/23220513/storing-a-list-of-strings-to-a-hdf5-dataset-from-python
    ascii_list = [n.encode("ascii", "ignore") for n in self._termination_conditions]
    max_length = max(len(s) for s in self._termination_conditions)
    zarr_file.create_dataset(
        "/parameters/termination_conditions",
        shape=(1, 1),
        dtype="S" + str(max_length),
        data=ascii_list,
        compression=compressor,
    )

    zarr_file.create_group("/statistics")
    zarr_file.create_dataset(
        "/statistics/number_selected_games", shape=(1,), dtype=np.int16, data=[len(pgn_sel)], compression=compressor
    )
    zarr_file.create_dataset(
        "/statistics/game_idx_start", shape=(1,), dtype=np.int16, data=[game_idx_start], compression=compressor
    )
    zarr_file.create_dataset(
        "/statistics/game_idx_end", shape=(1,), dtype=np.int16, data=[game_idx_end], compression=compressor
    )
    zarr_file.create_dataset(
        "/statistics/white_wins", shape=(1,), dtype=np.int16, data=[nb_white_wins], compression=compressor
    )
    zarr_file.create_dataset(
        "/statistics/black_wins", shape=(1,), dtype=np.int16, data=[nb_black_wins], compression=compressor
    )
    zarr_file.create_dataset(
        "/statistics/draws", shape=(1,), dtype=np.int16, data=[nb_draws], compression=compressor
    )
    store.close()

    logging.debug("dataset was exported to: %s", zarr_path)
    return True
def save_cands(id_patient, cands):
    candidates.array(id_patient, cands,
                     chunks=(40, 1, 512, 512),
                     compressor=zarr.Blosc(clevel=9, cname="zstd", shuffle=2),
                     synchronizer=zarr.ThreadSynchronizer())
def save_cands(id_patient, cands):
    cands_resized.array(id_patient, cands,
                        chunks=(1, 17, 21, 21),
                        compressor=zarr.Blosc(clevel=9, cname="zstd", shuffle=2),
                        synchronizer=zarr.ThreadSynchronizer())