Code Example #1
File: zarr_compress_2d.py  Project: Azurequeen/python
def save_zarr(id_patient, lung_mask, nodule_mask):
    lung_mask_group.array(id_patient, lung_mask, 
            chunks=(10, 1, 512, 512), compressor=zarr.Blosc(clevel=9, cname="zstd", shuffle=2), 
            synchronizer=zarr.ThreadSynchronizer())
    nodule_mask_group.array(id_patient, nodule_mask, 
            chunks=(10, 1, 512, 512), compressor=zarr.Blosc(clevel=9, cname="zstd", shuffle=2), 
            synchronizer=zarr.ThreadSynchronizer())
    return
Code Example #2
File: zarr_compress_3d.py  Project: Azurequeen/python
def save_zarr(id_patient, lung_mask, cand):
    lung_mask_group.array(id_patient, lung_mask, 
            chunks=(1, 17, 21, 21), compressor=zarr.Blosc(clevel=9, cname="zstd", shuffle=2), 
            synchronizer=zarr.ThreadSynchronizer())
    cand_group.array(id_patient, cand, 
            chunks=(1, 17, 21, 21), compressor=zarr.Blosc(clevel=9, cname="zstd", shuffle=2), 
            synchronizer=zarr.ThreadSynchronizer())
    return
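
Both save_zarr variants above write into module-level zarr groups (lung_mask_group, nodule_mask_group, cand_group) that are created elsewhere in the project and are not shown in these excerpts. A minimal sketch of how such groups could be set up is given below; the store path and group names are assumptions for illustration, not the project's actual code:

import zarr

# Hypothetical setup for the module-level groups referenced by save_zarr.
root = zarr.open_group('masks.zarr', mode='a')
lung_mask_group = root.require_group('lung_mask')
nodule_mask_group = root.require_group('nodule_mask')
cand_group = root.require_group('cand')

With the groups in place, group.array(name, data, ...) stores each patient's volume as its own array, and the per-array ThreadSynchronizer makes concurrent writes from multiple threads safe at chunk granularity.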
Code Example #3
    def reset_states(self, input_shape=None):
        """Initialize the state space

        This method initializes the layer, resets any previously held data,
        and (re)creates the backing zarr array.

        Args:
            input_shape (TensorShape, tuple, or list): Shape of the input.
        """

        if input_shape is not None:
            self._input_shape = input_shape

        if self._input_shape is None:
            raise ValueError(
                'The input_shape is None, and no previous input ' +
                'shape information was provided. The first time ' +
                'reset_states is called, an input_shape must be ' +
                'provided.')

        # Try to keep chunks limited to 16MB
        ncols = int(np.ceil(self._input_shape[self._channel_index] / 8))
        nrows = 2**22 // ncols

        # Initialize internal variables related to state space
        self._state_ids = None
        self._edges = None
        self._index = None
        self._counts = None
        self._entropy = None
        self._threads = []
        self._chunk_size = (nrows, ncols)
        self._state_shape = list(self._chunk_size)
        self._state_count = 0

        if self._raw_states is not None:
            # Zero out states and resize if zarr already open
            self._raw_states.resize(self._state_shape)
            self._raw_states[:] = 0
        else:
            # Initialize the zarr array
            if self._zarr_path is not None:
                if self._zarr_path.is_file():
                    self._zarr_path.unlink()

                self._raw_states = zarr.zeros(
                    shape=self._state_shape,
                    chunks=self._chunk_size,
                    dtype='B',
                    synchronizer=zarr.ThreadSynchronizer(),
                    store=str(self._zarr_path.absolute()))
            else:
                self._raw_states = zarr.zeros(
                    shape=self._state_shape,
                    chunks=self._chunk_size,
                    dtype='B',
                    synchronizer=zarr.ThreadSynchronizer())
Code Example #4
File: transforms.py  Project: hanabhp/otoworld
    def _open_cache(self, location):
        if self.overwrite:
            self.cache = zarr.open(location, mode='w', shape=(self.cache_size,),
                                   chunks=(1,), dtype=object,
                                   object_codec=numcodecs.Pickle(),
                                   synchronizer=zarr.ThreadSynchronizer())
        elif os.path.exists(location):
            self.cache = zarr.open(location, mode='r',
                                   object_codec=numcodecs.Pickle(),
                                   synchronizer=zarr.ThreadSynchronizer())
Code Example #5
File: convert.py  Project: bilts/netcdf-to-zarr
def __append_vars(ds, store, dim, mode='serial'):

    print("Append vars")
    dataset = __nc_open(ds)

    store[dim].append(dataset[dim])

    if mode == 'serial':
        for name in dataset.variables.keys():
            __append_var(ds, store, name, dim)

    elif mode == 'processes':
        with ProcessPoolExecutor(max_workers=8) as executor:
            syncro = zarr.ProcessSynchronizer(SHARED + 'ntz.sync')
            for name in dataset.variables.keys():
                executor.submit(__append_var, ds, store, name, dim, syncro)

    elif mode == 'threads':
        with ThreadPoolExecutor(max_workers=8) as executor:
            syncro = zarr.ThreadSynchronizer()
            for name in dataset.variables.keys():
                executor.submit(__append_var, ds, store, name, dim, syncro)

    else:
        raise ValueError('the mode %s is not valid.' % mode)
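
The __append_var helper is not included in this excerpt. A minimal, hypothetical sketch of what it might do is shown below; the re-opening logic and names are assumptions for illustration, not the project's actual implementation:

def __append_var(ds, store, name, dim, synchronizer=None):
    # Hypothetical sketch: append one NetCDF variable's data to the matching
    # zarr array. The dimension variable itself is appended by the caller.
    if name == dim:
        return
    dataset = __nc_open(ds)
    target = store[name]
    if synchronizer is not None:
        # Re-open the array with the synchronizer so concurrent appends from
        # several workers take chunk-level locks.
        target = zarr.open_array(store=target.store, path=target.path,
                                 mode='r+', synchronizer=synchronizer)
    target.append(dataset[name][...])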
Code Example #6
File: test_info.py  Project: tjcrone/zarr
def test_info():

    # setup
    g = zarr.group(store=dict(),
                   chunk_store=dict(),
                   synchronizer=zarr.ThreadSynchronizer())
    g.create_group('foo')
    z = g.zeros('bar', shape=10, filters=[numcodecs.Adler32()])

    # test group info
    items = g.info_items()
    keys = sorted([k for k, _ in items])
    expected_keys = sorted([
        'Type', 'Read-only', 'Synchronizer type', 'Store type',
        'Chunk store type', 'No. members', 'No. arrays', 'No. groups',
        'Arrays', 'Groups', 'Name'
    ])
    assert expected_keys == keys

    # test array info
    items = z.info_items()
    keys = sorted([k for k, _ in items])
    expected_keys = sorted([
        'Type', 'Data type', 'Shape', 'Chunk shape', 'Order', 'Read-only',
        'Filter [0]', 'Compressor', 'Synchronizer type', 'Store type',
        'Chunk store type', 'No. bytes', 'No. bytes stored', 'Storage ratio',
        'Chunks initialized', 'Name'
    ])
    assert expected_keys == keys
Code Example #7
def test_info(array_size):

    # setup
    g = zarr.group(store=dict(), chunk_store=dict(),
                   synchronizer=zarr.ThreadSynchronizer())
    g.create_group('foo')
    z = g.zeros('bar', shape=array_size, filters=[numcodecs.Adler32()])

    # test group info
    items = g.info_items()
    keys = sorted([k for k, _ in items])
    expected_keys = sorted([
        'Type', 'Read-only', 'Synchronizer type', 'Store type', 'Chunk store type',
        'No. members', 'No. arrays', 'No. groups', 'Arrays', 'Groups', 'Name'
    ])
    assert expected_keys == keys

    # can also get a string representation of info via the info attribute
    assert isinstance(g.info, InfoReporter)
    assert "Type" in repr(g.info)

    # test array info
    items = z.info_items()
    keys = sorted([k for k, _ in items])
    expected_keys = sorted([
        'Type', 'Data type', 'Shape', 'Chunk shape', 'Order', 'Read-only', 'Filter [0]',
        'Compressor', 'Synchronizer type', 'Store type', 'Chunk store type', 'No. bytes',
        'No. bytes stored', 'Storage ratio', 'Chunks initialized', 'Name'
    ])
    assert expected_keys == keys

    # can also get a string representation of info via the info attribute
    assert isinstance(z.info, InfoReporter)
    assert "Type" in repr(z.info)
Code Example #8
File: simulator_test.py  Project: adam-coogan/swyft
def test_run_simulator_with_threads_and_zarr_directory_store():
    """
    If the store is on disk (here a Zarr DirectoryStore), collect_in_memory can
    be set to False (but synchronization needs to be employed).
    """
    cluster = LocalCluster(n_workers=2, processes=False, threads_per_worker=1)
    simulator = Simulator(model, sim_shapes=dict(x=(10, )), cluster=cluster)

    with tempfile.TemporaryDirectory() as tmpdir:
        pars = zarr.open(f"{tmpdir}/pars.zarr", shape=(100, 2))
        pars[:, :] = np.random.random(pars.shape)
        x = zarr.open(f"{tmpdir}/x.zarr",
                      shape=(100, 10),
                      synchronizer=zarr.ThreadSynchronizer())
        x[:, :] = 0.0
        sims = dict(x=x.oindex)
        sim_status = zarr.open(
            f"{tmpdir}/sim_status.zarr",
            shape=(100, ),
            synchronizer=zarr.ThreadSynchronizer(),
        )
        sim_status[:] = np.full(100, SimulationStatus.RUNNING, dtype="int")

        # the following is non-blocking (it immediately returns)
        simulator.run(
            pars=pars,
            sims=sims,
            sim_status=sim_status.oindex,
            indices=np.arange(100, dtype=int),
            collect_in_memory=False,
            batch_size=20,
        )

        # need to wait for tasks to be completed
        _wait_for_all_tasks()

        assert np.all(sim_status[:] == SimulationStatus.FINISHED)
        assert not np.all(np.isclose(sims["x"][:, :].sum(axis=1), 0.0))
    simulator.client.close()
    cluster.close()
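
The test assumes a few names defined elsewhere: model and _wait_for_all_tasks come from the surrounding test module, and SimulationStatus comes from the library under test. Purely as hypothetical stand-ins (the real definitions may differ), the two helpers could look roughly like this:

import time
import numpy as np

def model(pars):
    # Toy simulator consistent with sim_shapes=dict(x=(10,)): one length-10
    # vector per parameter draw. Hypothetical stand-in only.
    return dict(x=np.repeat(np.asarray(pars).sum(), 10))

def _wait_for_all_tasks(seconds=5):
    # Crude stand-in: the real helper presumably polls until the Dask tasks
    # submitted by simulator.run have completed.
    time.sleep(seconds)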
Code Example #9
def compress_zarr_dataset(data,
                          file_path,
                          compression='lz4',
                          clevel=5,
                          start_idx=0,
                          end_idx=0):
    """
    Loads in a zarr data set and exports it with a given compression type and level
    :param data: Zarr data set which will be compressed
    :param file_path: File name path where the data will be exported (e.g. "./export/data.zip")
    :param compression: Compression type
    :param clevel: Compression level
    :param start_idx: Starting index of data to be exported.
    :param end_idx: If end_idx != 0, the data set will be exported up to the specified index,
    excluding the sample at end_idx (e.g. end_idx = len(x) will export it fully)
    :return: True if a NaN value was detected
    """
    compressor = Blosc(cname=compression, clevel=clevel, shuffle=Blosc.SHUFFLE)

    # open a dataset file and create arrays
    store = zarr.ZipStore(file_path, mode="w")
    zarr_file = zarr.group(store=store, overwrite=True)

    nan_detected = False
    for key in data.keys():
        if end_idx == 0:
            x = data[key]
        else:
            x = data[key][start_idx:end_idx]

        if np.isnan(x).any():
            nan_detected = True

        array_shape = list(x.shape)
        array_shape[0] = 128
        # export array
        zarr_file.create_dataset(
            name=key,
            data=x,
            shape=x.shape,
            dtype=x.dtype,
            chunks=array_shape,
            synchronizer=zarr.ThreadSynchronizer(),
            compression=compressor,
        )
    store.close()
    logging.info("dataset was exported to: %s", file_path)
    return nan_detected
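
A hypothetical usage sketch follows; the paths are made up for illustration, and the source is assumed to be an existing zarr group on disk:

# Recompress every array of an existing zarr group into a zstd-compressed ZipStore.
source = zarr.open_group('./export/raw_dataset.zarr', mode='r')
had_nans = compress_zarr_dataset(source, './export/data.zip',
                                 compression='zstd', clevel=5)
if had_nans:
    logging.warning("NaN values were found in the source data")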
Code Example #10
File: convert.py  Project: bilts/netcdf-to-zarr
def __set_dims(ds, group, mode):
    dataset = __nc_open(ds)
    if mode == 'serial':
        for name in dataset.variables.keys():
            __set_dim(ds, group, name)

    elif mode == 'processes':
        with ProcessPoolExecutor(max_workers=8) as executor:
            syncro = zarr.ProcessSynchronizer(SHARED + 'ntz.sync')
            for name in dataset.variables.keys():
                executor.submit(__set_dim, ds, group, name, syncro)

    elif mode == 'threads':
        with ThreadPoolExecutor(max_workers=8) as executor:
            syncro = zarr.ThreadSynchronizer()
            for name in dataset.variables.keys():
                executor.submit(__set_dim, ds, group, name, syncro)

    else:
        raise ValueError('the mode %s is not valid.' % mode)
Code Example #11
File: packing.py  Project: ska-sa/tricolour
def _create_window(name,
                   ntime,
                   nchan,
                   nbl,
                   ncorr,
                   dtype,
                   default,
                   token,
                   backend="numpy",
                   path=None):
    if backend == "zarr-disk":
        return zarr.creation.create(shape=(nbl, ncorr, ntime, nchan),
                                    chunks=(1, ncorr, ntime, nchan),
                                    compressor=None,
                                    dtype=dtype,
                                    synchronizer=zarr.ThreadSynchronizer(),
                                    overwrite=True,
                                    fill_value=default,
                                    read_only=False,
                                    store=pjoin(path, "-".join((name, token))))
    elif backend == "numpy":
        return np.full((nbl, ncorr, ntime, nchan), default, dtype=dtype)
    else:
        raise ValueError("Invalid backend '%s'" % backend)
Code Example #12
File: source.py  Project: askazik/ion
    def convert_to_zarr(self,
                        str_beg=None,
                        str_end=None,
                        out_filename=None,
                        out_path=None):
        """Create zarr file for data between datetime_beg and datetime_end."""

        if not str_end:
            end = self._datetime_end
        else:
            end = datetime.datetime.strptime(str_end, '%Y-%m-%d')

        if not str_beg:
            beg = self._datetime_beg
        else:
            beg = datetime.datetime.strptime(str_beg, '%Y-%m-%d')

        if not out_filename:
            store = self._compose_out_filename(str_beg, str_end) + '.zarr'
        else:
            store = out_filename

        if out_path:
            store = os.path.join(out_path, store)

        # create hierarchy
        root = zarr.open(store, mode='w')  # 'w' means create (overwrite if exists)
        # FIXME: Remove these root attributes
        root.attrs['DT'] = self.DT
        root.attrs['TZ'] = self.TZ
        raw = root.create_group('raw')

        # Zarr provides support for chunk-level synchronization.
        # This array is safe to read or write within a multi-threaded program.
        compressor = Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE)
        self.z_raw = raw.zeros('source',
                               shape=(len(self._sources), self._length_max),
                               chunks=(1, self._length_max),
                               dtype='i2',
                               compressor=compressor,
                               synchronizer=zarr.ThreadSynchronizer())
        self.z_raw[:] = np.nan

        i = 0
        axis_datetime = []
        pool = Pool(processes=6)
        for item in self._sources:
            if beg <= item['datetime'] <= end:
                axis_datetime.append(item['datetime'])
                cur_filename = item['filename']
                pool.apply_async(self.reader,
                                 args=(
                                     i,
                                     cur_filename,
                                 ),
                                 callback=self.log_result)
                i += 1
        pool.close()
        pool.join()
        # print(self.result_list)

        # append created axis
        # FIXME: set a datetime axis (for given TZ!!!)
        # s -> seconds precision!!!
        z_created = raw.zeros('created',
                              shape=(len(self._sources), ),
                              dtype='M8[s]')
        z_created[:] = axis_datetime

        print(root.tree())
Code Example #13
# In[2]:


wvel_data = np.random.normal(2000, 1000, size=[8000,7500]).astype(np.float32)
human_readable_size(wvel_data.nbytes)


# ### Copy to a zarr file on disk, using multiple threads

# In[3]:


item = 'disk1_data'
store = zarr.DirectoryStore(item)
group = zarr.hierarchy.group(store=store, overwrite=True,
                             synchronizer=zarr.ThreadSynchronizer())
the_var = 'wvel'
out_zarr1 = group.zeros(the_var, shape=wvel_data.shape, dtype=wvel_data.dtype,
                        chunks=[2000, 7500])
out_zarr1[...] = wvel_data[...]


# ### Add some attributes

# In[4]:


now = datetime.datetime.now(pytz.UTC)
timestamp = int(now.strftime('%s'))
out_zarr1.attrs['history'] = 'written for practice'
out_zarr1.attrs['creation_date'] = str(now)
out_zarr1.attrs['gmt_timestamp'] = timestamp
Code Example #14
# %% [markdown]
# ### Create 230 Mbytes of fake data

# %%
wvel_data = np.random.normal(2000, 1000, size=[8000, 7500]).astype(np.float32)
human_readable_size(wvel_data.nbytes)

# %% [markdown]
# ### Copy to a zarr file on disk, using multiple threads

# %%
item = 'disk1_data'
store = zarr.DirectoryStore(item)
group = zarr.hierarchy.group(store=store,
                             overwrite=True,
                             synchronizer=zarr.ThreadSynchronizer())
the_var = 'wvel'
out_zarr1 = group.zeros(the_var,
                        shape=wvel_data.shape,
                        dtype=wvel_data.dtype,
                        chunks=[2000, 7500])
out_zarr1[...] = wvel_data[...]

# %% [markdown]
# ### Add some attributes

# %%
now = datetime.datetime.now(pytz.UTC)
timestamp = int(now.strftime('%s'))
out_zarr1.attrs['history'] = 'written for practice'
out_zarr1.attrs['creation_date'] = str(now)
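
As a small follow-up sketch (not part of the original notebook), the on-disk store can be reopened read-only to confirm that the data and attributes round-trip:

readback = zarr.open_group('disk1_data', mode='r')
assert np.allclose(readback['wvel'][:], wvel_data)
print(readback['wvel'].attrs['history'])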
Code Example #15
    def export_pgn_batch(self, cur_part, game_idx_start, game_idx_end, pgn_sel, nb_white_wins, nb_black_wins, nb_draws):
        """
        Exports one part of the PGN files for the currently selected games.
        After exporting one part, the memory held by its local variables can be freed.
        If the function ran successfully, a new dataset part-file has been created in the dataset export directory.
        Multiprocessing is used for loading and exporting.

        :param cur_part: Current part (integer value which starts at 0).
        :param game_idx_start: Starting game index of the selected games for this part
        :param game_idx_end: End game index of the current part
        :param pgn_sel: Selected PGN data which will be used for the export
        :param nb_white_wins: Number of games which white won in the current part
        :param nb_black_wins: Number of games which black won in the current part
        :param nb_draws: Number of draws in the current part
        :return:
        """

        # create a param input list which pairs each pgn with its corresponding game index
        params_inp = []
        for i, pgn in enumerate(pgn_sel):
            game_idx = game_idx_start + i

            params_inp.append((pgn, game_idx, self._mate_in_one))

        logging.info("starting conversion to planes...")
        t_s = time()
        p = Pool()

        x_dic = {}
        y_value_dic = {}
        y_policy_dic = {}

        metadata_dic = {}

        if not os.path.exists(self._export_dir):
            os.makedirs(self._export_dir)
            logging.info("the dataset_export directory was created at: %s", self._export_dir)

        # create a directory for the current timestamp
        if not os.path.exists(self._timestmp_dir):
            os.makedirs(self._timestmp_dir)

        # http://machinelearninguru.com/deep_learning/data_preparation/hdf5/hdf5.html
        zarr_path = self._timestmp_dir + self._pgn_name.replace(".pgn", "_" + str(cur_part) + ".zip")

        # open a dataset file and create arrays
        store = zarr.ZipStore(zarr_path, mode="w")

        zarr_file = zarr.group(store=store, overwrite=True)

        # the games occur in random order due to multiprocessing
        # in order to keep structure we store the result in a dictionary first
        for metadata, game_idx, x, y_value, y_policy in p.map(get_planes_from_pgn, params_inp):
            metadata_dic[game_idx] = metadata
            x_dic[game_idx] = x
            y_value_dic[game_idx] = y_value
            y_policy_dic[game_idx] = y_policy

        p.close()
        p.join()
        t_e = time() - t_s
        logging.debug("elapsed time: %fs", t_e)
        t_mean = t_e / self._batch_size
        logging.debug("mean time for 1 game: %f ms", t_mean * 1000)
        # logging.debug('approx time for whole file (nb_games: %d): %fs', self._nb_games, t_mean * self._nb_games)

        # now we can convert the dictionary to a list
        metadata = get_dic_sorted_by_key(metadata_dic)
        x = get_dic_sorted_by_key(x_dic)
        y_value = get_dic_sorted_by_key(y_value_dic)
        y_policy = get_dic_sorted_by_key(y_policy_dic)

        # create a list which describes where each game starts
        start_indices = np.zeros(len(x))
        for i, x_cur in enumerate(x[:-1]):
            start_indices[i + 1] = start_indices[i] + len(x_cur)

        # next we stack the list into a numpy-array
        metadata = np.concatenate(metadata, axis=0)
        x = np.concatenate(x, axis=0)
        y_value = np.concatenate(y_value, axis=0)
        y_policy = np.concatenate(y_policy, axis=0)

        logging.debug("metadata.shape %s", metadata.shape)
        logging.debug("x.shape %s", x.shape)
        logging.debug("y_value.shape %s", y_value.shape)
        logging.debug("y_policy.shape %s", y_policy.shape)

        # Save the dataset to a file
        logging.info("saving the dataset to a file...")

        # define the compressor object
        compressor = Blosc(cname=self._compression, clevel=self._clevel, shuffle=Blosc.SHUFFLE)

        # export the metadata
        zarr_file.create_dataset(
            name="metadata",
            data=metadata,
            shape=metadata.shape,
            dtype=metadata.dtype,
            synchronizer=zarr.ThreadSynchronizer(),
            compression=compressor,
        )

        # export the images
        zarr_file.create_dataset(
            name="x",
            data=x,
            shape=x.shape,
            dtype=np.int16,
            chunks=(128, x.shape[1], x.shape[2], x.shape[3]),
            synchronizer=zarr.ThreadSynchronizer(),
            compression=compressor,
        )

        # create the label arrays and copy the labels data in them
        zarr_file.create_dataset(
            name="y_value", shape=y_value.shape, dtype=np.int16, data=y_value, synchronizer=zarr.ThreadSynchronizer()
        )
        zarr_file.create_dataset(
            name="y_policy",
            shape=y_policy.shape,
            dtype=np.int16,
            data=y_policy,
            chunks=(128, y_policy.shape[1]),
            synchronizer=zarr.ThreadSynchronizer(),
            compression=compressor,
        )

        zarr_file.create_dataset(
            name="start_indices",
            shape=start_indices.shape,
            dtype=np.int32,
            data=start_indices,
            synchronizer=zarr.ThreadSynchronizer(),
            compression=compressor,
        )

        # export the parameter settings and statistics of the file
        zarr_file.create_group("/parameters")

        zarr_file.create_dataset(
            name="/parameters/pgn_name",
            shape=(1,),
            dtype="S" + str(len(self._pgn_name) + 1),
            data=[self._pgn_name.encode("ascii", "ignore")],
            compression=compressor,
        )

        zarr_file.create_dataset(
            name="/parameters/limit_nb_games",
            data=[self._limit_nb_games],
            shape=(1,),
            dtype=np.int16,
            compression=compressor,
        )
        zarr_file.create_dataset(
            name="/parameters/batch_size", shape=(1,), dtype=np.int16, data=[self._batch_size], compression=compressor
        )
        zarr_file.create_dataset(
            name="/parameters/max_nb_files",
            shape=(1,),
            dtype=np.int16,
            data=[self._max_nb_files],
            compression=compressor,
        )
        zarr_file.create_dataset(
            name="/parameters/min_elo_both",
            shape=(1,),
            dtype=np.int16,
            data=[self._min_elo_both],
            compression=compressor,
        )
        if self._compression is not None:
            zarr_file.create_dataset(
                "/parameters/compression",
                shape=(1,),
                dtype="S" + str(len(self._compression) + 1),
                data=[self._compression.encode("ascii", "ignore")],
                compression=compressor,
            )
        # https://stackoverflow.com/questions/23220513/storing-a-list-of-strings-to-a-hdf5-dataset-from-python
        ascii_list = [n.encode("ascii", "ignore") for n in self._termination_conditions]
        max_length = max(len(s) for s in self._termination_conditions)
        zarr_file.create_dataset(
            "/parameters/termination_conditions",
            shape=(1, 1),
            dtype="S" + str(max_length),
            data=ascii_list,
            compression=compressor,
        )

        zarr_file.create_group("/statistics")
        zarr_file.create_dataset(
            "/statistics/number_selected_games", shape=(1,), dtype=np.int16, data=[len(pgn_sel)], compression=compressor
        )
        zarr_file.create_dataset(
            "/statistics/game_idx_start", shape=(1,), dtype=np.int16, data=[game_idx_start], compression=compressor
        )
        zarr_file.create_dataset(
            "/statistics/game_idx_end", shape=(1,), dtype=np.int16, data=[game_idx_end], compression=compressor
        )
        zarr_file.create_dataset(
            "/statistics/white_wins", shape=(1,), dtype=np.int16, data=[nb_white_wins], compression=compressor
        )
        zarr_file.create_dataset(
            "/statistics/black_wins", shape=(1,), dtype=np.int16, data=[nb_black_wins], compression=compressor
        )
        zarr_file.create_dataset(
            "/statistics/draws", shape=(1,), dtype=np.int16, data=[nb_draws], compression=compressor
        )

        store.close()

        logging.debug("dataset was exported to: %s", zarr_path)

        return True
Code Example #16
def save_cands(id_patient, cands):
    candidates.array(id_patient,
                     cands,
                     chunks=(40, 1, 512, 512),
                     compressor=zarr.Blosc(clevel=9, cname="zstd", shuffle=2),
                     synchronizer=zarr.ThreadSynchronizer())
Code Example #17
def save_cands(id_patient, cands):
    cands_resized.array(id_patient, cands, 
            chunks=(1, 17, 21, 21), compressor=zarr.Blosc(clevel=9, cname="zstd", shuffle=2),
            synchronizer=zarr.ThreadSynchronizer())