Python ProcessSynchronizer Examples, zarr.ProcessSynchronizer Python Examples

Example #1

0

Show file

    def __init__(self, hparams, dense):
        """Store the configuration and create the three zarr output arrays.

        Args:
            hparams: configuration object; ``batch_size`` is read from it here.
            dense: object whose ``get_dim()`` supplies the second dimension of
                every output array.
        """
        super().__init__()
        self.hparams = hparams
        self.dense = dense
        self.dummy = torch.nn.Linear(1, 1)

        # output
        out_path = self._get_output_path()
        n_rows = self.hparams.batch_size * 2
        # The initial shape doubles as the chunk shape.
        init_shape = chunks = (n_rows, self.dense.get_dim())
        self.batch_idx = 0
        # self.fout = zarr.open(zarr.ZipStore(out_path), mode="w")
        self.fout = zarr.open(out_path, mode="w")

        # One process-level synchronizer (own lock directory) per output array.
        targets = (
            ("query", zarr.ProcessSynchronizer("sync/query.sync")),
            ("doc_pos", zarr.ProcessSynchronizer("sync/doc_pos.sync")),
            ("doc_neg", zarr.ProcessSynchronizer("sync/doc_neg.sync")),
        )
        created = [
            self.fout.zeros(
                array_name,
                shape=init_shape,
                chunks=chunks,
                synchronizer=array_sync,
            )
            for array_name, array_sync in targets
        ]
        self.out_query, self.out_doc_pos, self.out_doc_neg = created

Example #2

0

Show file

File: xarray_helpers.py Project: billshi-NOAA/kluster

def xarr_to_zarr(xarr: xr.Dataset, outputpth: str, attrs: dict = None):
    """
    Push an xarray Dataset to a zarr store, creating the store on the first call and
    appending along 'time' on every later call.

    Parameters
    ----------
    xarr
        xarray Dataset to write to zarr
    outputpth
        path to the zarr rootgroup folder to write
    attrs
        optional attribution; when given it replaces xarr.attrs before writing

    Returns
    -------
    str
        path to the zarr group (outputpth, unchanged)
    """

    if attrs is not None:
        xarr.attrs = attrs

    if os.path.exists(outputpth):
        # Existing store: append under a process-level lock kept next to the store.
        write_kwargs = {
            'mode': 'a',
            'synchronizer': zarr.ProcessSynchronizer(outputpth + '.sync'),
            'append_dim': 'time',
        }
    else:
        # New store: 'w-' is the create-only mode.
        write_kwargs = {'mode': 'w-'}

    # NOTE(review): compute=False is passed and the value returned by to_zarr is
    # discarded here -- confirm that the array data is written as intended.
    xarr.to_zarr(outputpth, compute=False, **write_kwargs)

    return outputpth

Example #3

0

Show file

File: concurrent-cubegen-test.py Project: dzelge/xcube

def main(args=None):
    """
    Write one input dataset into a zarr cube, or fan out over a directory of inputs.

    When the second argument is a directory, its entries are shuffled and each one
    is handed to a child run of this same script (``sys.executable sys.argv[0]
    OUTPUT INPUT``), one after the other. When it is a single file, the file is
    opened, reduced to the data variables "analysed_sst" and "analysis_error", and
    then either written as a new zarr store (if OUTPUT is not yet a directory) or
    appended along the 'time' dimension to the arrays of the existing store.

    Parameters
    ----------
    args
        two-item sequence ``[OUTPUT.zarr, INPUT.nc | INPUT.dir]``; defaults to
        ``sys.argv[1:]``. Any other length prints the usage line and exits with
        status 2.
    """
    args = args if args is not None else sys.argv[1:]
    if len(args) != 2:
        print(f'Usage: {sys.argv[0]} OUTPUT.zarr (INPUT.nc | INPUT.dir)')
        sys.exit(2)

    output_dir, input_file = args

    if os.path.isdir(input_file):
        input_dir = input_file
        input_files = os.listdir(input_dir)
        # Uniform in-place shuffle (the previous random-swap loop was biased).
        random.shuffle(input_files)
        for file_name in input_files:
            print(f'processing {file_name}')
            subprocess.run([
                sys.executable, sys.argv[0], output_dir,
                os.path.join(input_dir, file_name)
            ])
        return

    synchronizer = zarr.ProcessSynchronizer(output_dir + '.sync')
    input_ds = xr.open_dataset(input_file, decode_times=False)
    dropped_vars = set(
        input_ds.data_vars.keys()) - {"analysed_sst", "analysis_error"}
    input_ds = input_ds.drop(dropped_vars)

    if not os.path.isdir(output_dir):
        # First write: create the store with one chunk per whole variable.
        compressor = zarr.Blosc(cname='zstd', clevel=3, shuffle=2)
        encoding = {
            var_name: {'compressor': compressor,
                       'chunks': input_ds[var_name].shape}
            for var_name in input_ds.data_vars
        }
        input_ds.to_zarr(output_dir,
                         encoding=encoding,
                         synchronizer=synchronizer)
        print(f'written {input_file} to {output_dir}')
        return

    # Store already exists: append every variable that has a 'time' dimension.
    root_group = zarr.open(output_dir, mode='a', synchronizer=synchronizer)
    for var_name, var_array in root_group.arrays():
        if var_name not in input_ds:
            continue
        var = input_ds[var_name]
        if 'time' not in var.dims:
            continue
        if var_name == 'time':
            print('time:', var, var.values)
        axis = var.dims.index('time')
        # Note: all append operations are forced to be sequential!
        # See https://github.com/zarr-developers/zarr/issues/75
        var_array.append(var, axis=axis)
    print(f'appended {input_file} to {output_dir}')

Example #4

0

Show file

File: convert.py Project: bilts/netcdf-to-zarr

def __append_vars(ds, store, dim, mode='serial'):
    """
    Append the dimension ``dim`` and then every variable of ``ds`` to ``store``.

    ``mode`` selects how the per-variable ``__append_var`` calls are issued:
    'serial' runs them in a plain loop, 'processes' submits them to a process
    pool with a shared ``zarr.ProcessSynchronizer``, and 'threads' submits them
    to a thread pool with a ``zarr.ThreadSynchronizer``. Any other value raises
    ``ValueError`` -- after the dimension itself has already been appended.
    """

    print("Append vars")
    dataset = __nc_open(ds)

    store[dim].append(dataset[dim])

    if mode == 'serial':
        for name in dataset.variables.keys():
            __append_var(ds, store, name, dim)
        return

    # Both pooled modes share one submit loop; only the pool type and the
    # synchronizer differ.
    if mode == 'processes':
        pool_cls = ProcessPoolExecutor

        def make_sync():
            return zarr.ProcessSynchronizer(SHARED + 'ntz.sync')
    elif mode == 'threads':
        pool_cls = ThreadPoolExecutor
        make_sync = zarr.ThreadSynchronizer
    else:
        raise ValueError('the mode %s is not valid.' % mode)

    with pool_cls(max_workers=8) as executor:
        syncro = make_sync()
        for name in dataset.variables.keys():
            # The returned futures are not kept, as in the serial-free original.
            executor.submit(__append_var, ds, store, name, dim, syncro)

Example #5

0

Show file

File: _zarr.py Project: billshi-NOAA/kluster

    def _write_new_dataset_rootgroup(self, xarr: xr.Dataset, var_name: str, dims_of_arrays: dict, chunksize: tuple,
                                     startingshp: tuple):
        """
        Create the array ``var_name`` in the rootgroup, fill it from ``xarr`` and resize it to ``startingshp``.

        Parameters
        ----------
        xarr
            data to write to zarr
        var_name
            variable name
        dims_of_arrays
            where keys are array names and values list of dims/shape.  Example: 'beampointingangle': [['time', 'sector', 'beam'], (5000, 3, 400)]
            The shape entry (index 1) is the shape the array is created with.
        chunksize
            chunk shape used to create the zarr array
        startingshp
            shape the array is resized to once the data has been written
        """

        # A process-level lock is only used when a zarr path is set.
        sync = zarr.ProcessSynchronizer(self.zarr_path + '.sync') if self.zarr_path else None

        created_shape = dims_of_arrays[var_name][1]
        source = xarr[var_name]
        newarr = self.rootgroup.create_dataset(
            var_name,
            shape=created_shape,
            chunks=chunksize,
            dtype=source.dtype,
            synchronizer=sync,
            fill_value=self._get_arr_nodatavalue(source.dtype),
        )
        newarr[:] = source.values
        newarr.resize(startingshp)

Example #6

0

Show file

File: _zarr.py Project: billshi-NOAA/kluster

    def open(self):
        """
        Open the zarr store at self.zarr_path in mode 'a' (created if missing) under a process-level
        synchronizer, keep the result as self.rootgroup, then refresh the array names.
        """

        self.rootgroup = zarr.open(
            self.zarr_path,
            mode='a',
            # lock directory sits next to the store: <zarr_path>.sync
            synchronizer=zarr.ProcessSynchronizer(self.zarr_path + '.sync'),
        )
        self.get_array_names()

Example #7

0

Show file

File: simulator_test.py Project: adam-coogan/swyft

def test_run_simulator_with_processes_and_zarr_directory_store():
    """
    If the store is on disk (here a Zarr DirectoryStore), collect_in_memory can
    be set to False (but synchronization needs to be employed).

    The client and the cluster are closed in ``finally`` blocks so that a failing
    assertion does not leave them running.
    """
    cluster = LocalCluster(n_workers=2, processes=True, threads_per_worker=1)
    try:
        simulator = Simulator(model, sim_shapes=dict(x=(10, )), cluster=cluster)
        try:
            with tempfile.TemporaryDirectory() as tmpdir:
                pars = zarr.open(f"{tmpdir}/pars.zarr", shape=(100, 2))
                pars[:, :] = np.random.random(pars.shape)

                # Each on-disk output array gets its own lock directory.
                synchronizer = zarr.ProcessSynchronizer(
                    path=f"{tmpdir}/x.sync")
                x = zarr.open(f"{tmpdir}/x.zarr",
                              shape=(100, 10),
                              synchronizer=synchronizer)
                x[:, :] = 0.0
                sims = dict(x=x.oindex)

                synchronizer = zarr.ProcessSynchronizer(
                    path=f"{tmpdir}/sim_status.sync")
                sim_status = zarr.open(
                    f"{tmpdir}/sim_status.zarr",
                    shape=(100, ),
                    synchronizer=synchronizer,
                    dtype="int",
                )
                sim_status[:] = np.full(100,
                                        SimulationStatus.RUNNING,
                                        dtype="int")

                # the following is non-blocking (it immediately returns)
                simulator.run(
                    pars=pars,
                    sims=sims,
                    sim_status=sim_status.oindex,
                    # builtin int: the np.int alias no longer exists in NumPy
                    indices=np.arange(100, dtype=int),
                    collect_in_memory=False,
                    batch_size=20,
                )

                # need to wait for tasks to be completed
                _wait_for_all_tasks()

                assert np.all(sim_status[:] == SimulationStatus.FINISHED)
                assert not np.all(
                    np.isclose(sims["x"][:, :].sum(axis=1), 0.0))
        finally:
            simulator.client.close()
    finally:
        cluster.close()

Example #8

0

Show file

def reload_zarr_records(pth: str,
                        skip_dask: bool = False,
                        sort_by: str = None):
    """
    After writing new data to the zarr data store, you need to refresh the xarray Dataset object so that it
    sees the changes.  We do that here by just re-running open_zarr.

    All the decoding keyword arguments are set to False to read the saved zarr arrays as stored.  The original
    author notes that mask_and_scale replaces values equal to the fill_value attribute with NaN and seemed to
    replace zeros with NaN even when fill_value is non-zero; setting it to False prevents this.  You can read
    more here:  http://xarray.pydata.org/en/stable/generated/xarray.open_zarr.html

    Parameters
    ----------
    pth
        string, path to xarray Dataset stored as zarr datastore
    skip_dask
        if True, open without the process synchronizer (for use outside of the dask distributed workflow, e.g.
        when only opening briefly to read attributes)
    sort_by
        optional, will sort by the dimension provided, if provided (ex: 'time')

    Returns
    -------
    xarray Dataset or None
        the freshly opened (and optionally sorted) Dataset, or None if pth does not exist
    """

    if not os.path.exists(pth):
        print('Unable to reload, no paths found: {}'.format(pth))
        return None

    # The synchronizer is only built when it is actually going to be used.
    sync = None if skip_dask else zarr.ProcessSynchronizer(pth + '.sync')
    data = xr.open_zarr(pth,
                        synchronizer=sync,
                        consolidated=False,
                        mask_and_scale=False,
                        decode_coords=False,
                        decode_times=False,
                        decode_cf=False,
                        concat_characters=False)
    if sort_by:
        return data.sortby(sort_by)
    return data

Example #9

0

Show file

File: zarr_.py Project: jgrss/geowombat

def to_zarr(filename, data, window, chunks, root=None):

    """
    Writes data to a zarr file

    The data for one window is stored as the array ``data`` inside a new group
    named after the file's base name and the window offsets/size. The window
    geometry is also saved in the group's attributes.

    Args:
        filename (str): The output file name.
        data (ndarray): The data to write.
        window (namedtuple): A ``rasterio.window.Window`` object.
        chunks (int or tuple): The ``zarr`` chunks.
        root (Optional[object]): The ``zarr`` root. When ``None``, the store at
            ``<parent of filename>/sub_tmp_/data.zarr`` is opened in ``'r+'`` mode.

    Returns:
        ``str``: The path of the zarr store, ``<parent of filename>/sub_tmp_/data.zarr``.
    """

    p = Path(filename)

    f_base = p.name.split('.')[0]
    d_name = p.parent
    sub_dir = d_name.joinpath('sub_tmp_')
    zarr_file = sub_dir.joinpath('data.zarr').as_posix()

    # sub_dir.mkdir(parents=True, exist_ok=True)

    # Compare against None explicitly: a zarr group with no members has
    # length 0 and is therefore falsy, so ``not root`` would discard an empty
    # root that the caller passed in.
    if root is None:
        root = zarr.open(zarr_file, mode='r+')

    group_name = '{BASE}_y{Y:09d}_x{X:09d}_h{H:09d}_w{W:09d}'.format(BASE=f_base,
                                                                     Y=window.row_off,
                                                                     X=window.col_off,
                                                                     H=window.height,
                                                                     W=window.width)

    group = root.create_group(group_name)

    synchronizer = zarr.ProcessSynchronizer('data.sync')

    # NOTE(review): ``compressor`` is not defined in this function; presumably
    # a module-level name -- confirm.
    group.array('data',
                data,
                compressor=compressor,
                dtype=data.dtype.name,
                chunks=chunks,
                synchronizer=synchronizer)

    group.attrs['row_off'] = window.row_off
    group.attrs['col_off'] = window.col_off
    group.attrs['height'] = window.height
    group.attrs['width'] = window.width

    return zarr_file

Example #10

0

Show file

File: store.py Project: NLeSC-GO-common-infrastructure/swyft

    def __init__(
        self,
        params: Union[int, list],
        zarr_store: Union[zarr.MemoryStore, zarr.DirectoryStore],
        simulator=None,
        sync_path: Optional[PathType] = None,
    ):
        """Attach the store to its zarr storage and load or create its content.

        Args:
            params (list of strings or int): List of parameter names.  If int use ['z0', 'z1', ...].
            zarr_store: zarr storage.
            simulator: optional simulator; its ``sim_shapes`` is read when an
                empty storage has to be set up.
            sync_path: path to the cache lock files. Must be accessible to all
                processes working on the cache.

        Raises:
            KeyError: if the storage is neither empty nor made of exactly the
                keys 'samples' and 'metadata'.
        """
        self._zarr_store = zarr_store
        self._simulator = simulator

        if isinstance(params, int):
            params = ["z%i" % idx for idx in range(params)]
        self.params = params

        # File-lock based synchronization is only enabled for a truthy path.
        if sync_path:
            synchronizer = zarr.ProcessSynchronizer(sync_path)
        else:
            synchronizer = None
        self._root = zarr.group(store=self.zarr_store, synchronizer=synchronizer)

        logging.debug("  params = %s" % str(params))

        existing_keys = set(self._root.keys())
        if existing_keys == {"samples", "metadata"}:
            logging.info("Loading existing store.")
            self._update()
        elif not existing_keys:
            logging.info("Creating new store.")
            self._setup_new_zarr_store(
                len(self.params), simulator.sim_shapes, self._root
            )
            logging.debug("  sim_shapes = %s" % str(simulator.sim_shapes))
        else:
            raise KeyError(
                "The zarr storage is corrupted. It should either be empty or only have the keys ['samples', 'metadata']."
            )

        self._lock = None
        # Note the different test from above: here only None disables the lock.
        if sync_path is not None:
            self._setup_lock(sync_path)

Example #11

0

Show file

File: convert.py Project: bilts/netcdf-to-zarr

def __set_dims(ds, group, mode):
    """
    Call ``__set_dim`` for every variable of the dataset opened from ``ds``.

    ``mode`` selects how the calls are issued: 'serial' runs them in a plain
    loop, 'processes' submits them to a process pool with a shared
    ``zarr.ProcessSynchronizer``, and 'threads' submits them to a thread pool
    with a ``zarr.ThreadSynchronizer``. Any other value raises ``ValueError``.
    """
    dataset = __nc_open(ds)

    if mode == 'serial':
        for name in dataset.variables.keys():
            __set_dim(ds, group, name)
        return

    # Both pooled modes share one submit loop; only the pool type and the
    # synchronizer differ.
    if mode == 'processes':
        pool_cls = ProcessPoolExecutor

        def make_sync():
            return zarr.ProcessSynchronizer(SHARED + 'ntz.sync')
    elif mode == 'threads':
        pool_cls = ThreadPoolExecutor
        make_sync = zarr.ThreadSynchronizer
    else:
        raise ValueError('the mode %s is not valid.' % mode)

    with pool_cls(max_workers=8) as executor:
        syncro = make_sync()
        for name in dataset.variables.keys():
            # The returned futures are not kept.
            executor.submit(__set_dim, ds, group, name, syncro)

Example #12

0

Show file

File: _zarr.py Project: billshi-NOAA/kluster

def my_xarr_add_attribute(attrs: dict, outputpth: str):
    """
    Open the zarr instance at outputpth and hand it, together with attrs, to the attribute writer.

    Parameters
    ----------
    attrs
        dictionary of combined attributes from xarray datasets, None if no attributes exist
    outputpth
        path to zarr group to either be created or append to

    Returns
    -------
    str
        path to the final zarr group (outputpth, unchanged)
    """

    rootgroup = zarr.open(
        outputpth,
        mode='a',  # read/write, create if doesnt exist
        synchronizer=zarr.ProcessSynchronizer(outputpth + '.sync'),
    )
    _my_xarr_to_zarr_writeattributes(rootgroup, attrs)
    return outputpth

Example #13

0

Show file

File: select_agents.py Project: sudha-vijayakumar/CMPE258_LYFT_l5kit

def select_agents(
    zarr_dataset: ChunkedDataset,
    th_agent_prob: float,
    th_yaw_degree: float,
    th_extent_ratio: float,
    th_distance_av: float,
) -> None:
    """
    Compute an agents mask for ``zarr_dataset`` and store it inside the dataset folder.

    The mask is a uint32 zarr array of shape (number of agents, 2) written to
    ``<dataset path>/agents_mask/<th_agent_prob>``. It is filled scene by scene
    from the results of ``get_valid_agents`` running in a process pool. A
    summary of the thresholds and counters and a past/future table are printed.

    Raises:
        FileExistsError: if a mask for ``th_agent_prob`` is already on disk.
    """
    agents_mask_path = Path(zarr_dataset.path) / f"agents_mask/{th_agent_prob}"
    if agents_mask_path.exists():
        raise FileExistsError(
            f"{th_agent_prob} exists already! only one is supported!")

    frame_index_intervals = zarr_dataset.scenes["frame_index_interval"]

    # build a partial with all args except the first one (will be passed by threads)
    worker = partial(
        get_valid_agents,
        dataset=zarr_dataset,
        th_agent_filter_probability_threshold=th_agent_prob,
        th_yaw_degree=th_yaw_degree,
        th_extent_ratio=th_extent_ratio,
        th_distance_av=th_distance_av,
    )

    try:
        zarr.open(zarr_dataset.path, mode="a").create_group("agents_mask")
    except ValueError:
        pass  # group is already there

    # Lock files live in a uniquely named folder inside the temp directory.
    lock_dir = Path(gettempdir()) / f"ag_mask_{str(uuid4())}.sync"
    agents_mask = zarr.open_array(
        str(agents_mask_path),
        mode="w",
        shape=(len(zarr_dataset.agents), 2),
        chunks=(10000, ),
        dtype=np.uint32,
        synchronizer=zarr.ProcessSynchronizer(lock_dir),
    )

    report: Counter = Counter()
    print("starting pool...")
    with Pool(cpu_count()) as pool:
        results = pool.imap_unordered(worker, frame_index_intervals)
        tasks = tqdm(enumerate(results))
        for idx, (mask, count, agents_range) in tasks:
            report += count
            first_agent, last_agent = agents_range[0], agents_range[1]
            agents_mask[first_agent:last_agent] = mask
            tasks.set_description(f"{idx + 1}/{len(frame_index_intervals)}")
        print("collecting results..")

    agents_cfg = {
        "th_agent_filter_probability_threshold": th_agent_prob,
        "th_yaw_degree": th_yaw_degree,
        "th_extent_ratio": th_extent_ratio,
        "th_distance_av": th_distance_av,
    }
    # print report
    pp = pprint.PrettyPrinter(indent=4)
    print(f"start report for {zarr_dataset.path}")
    pp.pprint({**agents_cfg, **report})

    future_steps = [0, 10, 30, 50]
    past_steps = [0, 10, 30, 50]
    agents_mask_np = np.asarray(agents_mask)

    header = ["past/future"] + [str(step) for step in future_steps]
    table = PrettyTable(field_names=header)
    for step_p in tqdm(past_steps, desc="computing past/future table"):
        # Column 0 is compared with the past step, column 1 with the future step.
        past_mask = agents_mask_np[:, 0] >= step_p
        counts = [
            np.sum(past_mask * (agents_mask_np[:, 1] >= step_f))
            for step_f in future_steps
        ]
        table.add_row([step_p] + counts)
    print(table)
    print(f"end report for {zarr_dataset.path}")
    print("==============================")

Example #14

0

Show file

File: process_data.py Project: zhuwenjian/ai_for_ci

# Root logging setup for the script: INFO level, timestamped messages.
logging_format = '%(asctime)s - %(name)s - %(message)s'
logging.root.setLevel(logging.INFO)
logging.basicConfig(level=logging.INFO,
                    format=logging_format,
                    datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger("ProcessData")

# Base folders of the raw radar and satellite data (relative to the working directory).
# radarBase = "/wave/mlp/cwb-ci/Radar/raw/"
# satBase = "/wave/mlp/cwb-ci/Satellite/raw/"
radarBase = "../../data/cwb-ci/Radar/raw/"
satBase = "../../data/cwb-ci/Satellite/raw/"

# Module-level zarr settings. A ``global`` statement has no effect at module
# scope, so ``compressor`` is simply assigned here.
compressor = zarr.Blosc(cname='zstd', clevel=3, shuffle=2)
synchronizer = zarr.ProcessSynchronizer('example.sync')


def bilinear_resize(image, height, width):
    """
    Resize a 2-D array to ``height`` x ``width``.

    `image` is a 2-D numpy array
    `height` and `width` are the desired spatial dimension of the new 2-D array.

    NOTE(review): only the setup part of this function is visible here (no
    interpolation step and no return) -- presumably the snippet is truncated;
    confirm against the full source.
    """
    img_height, img_width = image.shape

    # Continue with the pixels as a flat 1-D array.
    image = image.ravel()

    # Input step per output pixel along each axis; 0 when the target has a
    # single column (x) or a single row (y).
    x_ratio = float(img_width - 1) / (width - 1) if width > 1 else 0
    y_ratio = float(img_height - 1) / (height - 1) if height > 1 else 0

    # Row index (y) and column index (x) of every output pixel, row by row.
    y, x = np.divmod(np.arange(height * width), width)

Example #15

0

Show file

    def __init__(
        self,
        params: Union[int, list],
        zarr_store: Union[zarr.MemoryStore, zarr.DirectoryStore],
        simulator: Optional[Simulator] = None,
        sync_path: Optional[PathType] = None,
        chunksize: int = 1000,
    ):
        """Attach the store to its zarr storage and load or create its content.

        Args:
            params (list of strings or int): List of parameter names.  If int use ['z0', 'z1', ...].
            zarr_store: zarr storage.
            simulator: simulator object; its ``sim_shapes`` is read when an empty
                storage has to be set up.
            sync_path: if specified, it will enable synchronization using file locks (files will be
                stored in the given path). Must be accessible to all processes working on the store
                and the underlying filesystem must support file locking.
            chunksize: the parameters and simulation output will be stored as arrays with the
                specified chunk size along the sample dimension (a single chunk will be used for the
                other dimensions).

        Raises:
            KeyError: if the storage is neither empty nor made of exactly the
                keys 'samples' and 'metadata'.
        """
        self._zarr_store = zarr_store
        self._simulator = simulator

        if isinstance(params, int):
            params = ["z%i" % idx for idx in range(params)]
        self.params = params

        # File-lock based synchronization is only enabled for a truthy path.
        if sync_path:
            synchronizer = zarr.ProcessSynchronizer(sync_path)
        else:
            synchronizer = None
        self._root = zarr.group(store=self.zarr_store,
                                synchronizer=synchronizer)

        log.debug("  params = %s" % str(params))

        # TODO (from the original): presumably switch the prints below back to
        # log.debug -- confirm.
        existing_keys = set(self._root.keys())
        if existing_keys == {"samples", "metadata"}:
            print("Loading existing store.")
            self._update()
        elif not existing_keys:
            print("Creating new store.")
            self._setup_new_zarr_store(len(self.params),
                                       simulator.sim_shapes,
                                       self._root,
                                       chunksize=chunksize)
            log.debug("  sim_shapes = %s" % str(simulator.sim_shapes))
        else:
            raise KeyError(
                "The zarr storage is corrupted. It should either be empty or only have the keys ['samples', 'metadata']."
            )

        # a second layer of synchronization is required to grow the store
        self._lock = None
        if sync_path is not None:
            self._setup_lock(sync_path)

Example #16

0

Show file

File: select_agents.py Project: moridaiki/l5kit

def select_agents(
    input_folder: str,
    th_agent_prob: float,
    th_history_num_frames: int,
    th_future_num_frames: int,
    th_yaw_degree: float,
    th_extent_ratio: float,
    th_movement: float,
    th_distance_av: float,
    num_workers: int,
) -> None:
    """
    Filter agents from zarr INPUT_FOLDER according to multiple thresholds and store a boolean array of the same shape.

    The mask has one entry per agent and is written to
    ``<dataset path>/agents_mask/<history>_<future>_<prob>``. It is filled scene
    by scene from the results of ``get_valid_agents`` running in a pool of
    ``num_workers`` processes. A summary of the thresholds and counters is
    printed at the end.

    Args:
        input_folder: dataset key/path, resolved through ``LocalDataManager.require``.
        th_agent_prob, th_history_num_frames, th_future_num_frames, th_yaw_degree,
        th_extent_ratio, th_movement, th_distance_av: thresholds forwarded to
            ``get_valid_agents``; ``th_future_num_frames`` must be > 0.
        num_workers: size of the process pool.

    Raises:
        AssertionError: if ``th_future_num_frames`` is not positive, or if the
            collected counters are inconsistent.
        FileExistsError: if the output group already exists in the dataset.
    """
    assert th_future_num_frames > 0

    # ===== LOAD
    dm = LocalDataManager()
    input_folder = dm.require(input_folder)

    zarr_dataset = ChunkedStateDataset(path=input_folder)
    zarr_dataset.open()

    output_group = f"{th_history_num_frames}_{th_future_num_frames}_{th_agent_prob}"
    if "agents_mask" in zarr_dataset.root and f"agents_mask/{output_group}" in zarr_dataset.root:
        raise FileExistsError(
            f"{output_group} exists already! only one is supported for now!")

    frame_index_intervals = zarr_dataset.scenes["frame_index_interval"]

    # build a partial with all args except the first one (will be passed by threads)
    get_valid_agents_partial = partial(
        get_valid_agents,
        dataset=zarr_dataset,
        th_frames_past=th_history_num_frames,
        th_frames_future=th_future_num_frames,
        th_agent_filter_probability_threshold=th_agent_prob,
        th_yaw_degree=th_yaw_degree,
        th_extent_ratio=th_extent_ratio,
        th_movement=th_movement,
        th_distance_av=th_distance_av,
    )

    try:
        root = zarr.open(zarr_dataset.path, mode="a")
        root.create_group("agents_mask")
    except ValueError:
        pass  # group is already there

    agents_mask = zarr.open_array(
        str(Path(zarr_dataset.path) / "agents_mask" / output_group),
        mode="w",
        shape=(len(zarr_dataset.agents), ),
        chunks=(10000, ),
        # builtin bool: the np.bool alias no longer exists in NumPy
        dtype=bool,
        synchronizer=zarr.ProcessSynchronizer(
            f"/tmp/ag_mask_{str(uuid4())}.sync"),
    )

    report: Counter = Counter()
    print("starting pool...")
    with Pool(num_workers) as pool:
        tasks = tqdm(
            enumerate(
                pool.imap_unordered(get_valid_agents_partial,
                                    frame_index_intervals)))
        for idx, (mask, count, agents_range) in tasks:
            report += count
            agents_mask[agents_range[0]:agents_range[1]] = mask
            tasks.set_description(f"{idx + 1}/{len(frame_index_intervals)}")
        print("collecting results..")

    # Sanity check: every agent frame is either selected or rejected.
    assert (report["total_agent_frames"] == report["selected_agent_frames"] +
            report["total_reject"]), "something went REALLY wrong"

    agents_cfg = {
        "th_history_num_frames": th_history_num_frames,
        "th_future_num_frames": th_future_num_frames,
        "th_agent_filter_probability_threshold": th_agent_prob,
        "th_yaw_degree": th_yaw_degree,
        "th_extent_ratio": th_extent_ratio,
        "th_movement": th_movement,
        "th_distance_av": th_distance_av,
    }
    # print report
    pp = pprint.PrettyPrinter(indent=4)
    print(f"start report for {input_folder}")
    pp.pprint({**agents_cfg, **report})
    print(f"end report for {input_folder}")
    print("==============================")

Example #17

0

Show file

# Process the files in batches of `incr` and append each batch to one zarr
# store along the 'N_PROF' dimension.
# NOTE(review): `files`, `start`, `process_float`, `client` and `cluster` are
# not defined in this snippet -- presumably set up earlier; confirm.
incr = 1000
stop = len(files)
ranges = list(range(start, stop, incr))
for i in tqdm_notebook(ranges):
    print(f'Processing {i}')
    # Collect one process_float result per file of this batch.
    d = []
    for file in files[i:i + incr]:
        print(file)
        d.append(process_float(file))

    # Evaluate the whole batch in one dask.compute call.
    results = dask.compute(*d)

    # Join the batch along 'N_PROF' and rechunk before writing.
    t = xr.concat(results, dim='N_PROF', coords='minimal')
    t = t.chunk({'N_PROF': 10000, 'N_LEVELS': 3000})
    print(f'Finished concatenating dataset')

    # Turn off Blosc's own threads before the synchronized write.
    numcodecs.blosc.use_threads = False
    synchronizer = zarr.ProcessSynchronizer('../../argozarr/argodask2.sync')
    #compressor = zarr.Blosc(cname='zstd', clevel=3, shuffle=2)
    zarr_path = '../../argozarr/argo_dask2.zarr'
    #encoding = {vname: {'compressor': compressor} for vname in t.variables}
    # Append this batch to the store; note that `d` is reused for the result.
    d = t.to_zarr(zarr_path,
                  mode='a',
                  synchronizer=synchronizer,
                  compute=True,
                  append_dim='N_PROF')
    print('Appending Done!')
    # Restart the dask client after every batch.
    client.restart()

# All batches done: close the cluster.
cluster.close()