def __init__(self, hparams, dense):
    """Store the configuration and create the zarr output arrays.

    Opens the store returned by ``self._get_output_path()`` in write mode
    and creates three zero-filled arrays in it (``query``, ``doc_pos`` and
    ``doc_neg``), each guarded by its own ``zarr.ProcessSynchronizer``.

    Parameters
    ----------
    hparams
        hyper-parameter object; ``hparams.batch_size`` is read here
    dense
        object exposing ``get_dim()``, which gives the second dimension
        of the output arrays
    """
    super().__init__()
    self.hparams = hparams
    self.dense = dense
    self.dummy = torch.nn.Linear(1, 1)

    # output
    out_path = self._get_output_path()
    # The initial shape doubles as the chunk shape: one chunk per array.
    array_shape = (self.hparams.batch_size * 2, self.dense.get_dim())
    self.batch_idx = 0
    self.fout = zarr.open(out_path, mode="w")

    # (attribute name, array name, synchronizer) for every output array.
    # All synchronizers are built before the first array is created.
    outputs = (
        ("out_query", "query", zarr.ProcessSynchronizer("sync/query.sync")),
        ("out_doc_pos", "doc_pos", zarr.ProcessSynchronizer("sync/doc_pos.sync")),
        ("out_doc_neg", "doc_neg", zarr.ProcessSynchronizer("sync/doc_neg.sync")),
    )
    for attribute, array_name, sync in outputs:
        array = self.fout.zeros(
            array_name,
            shape=array_shape,
            chunks=array_shape,
            synchronizer=sync,
        )
        setattr(self, attribute, array)
def xarr_to_zarr(xarr: xr.Dataset, outputpth: str, attrs: dict = None):
    """
    Write an xarray Dataset to a zarr store, creating the store on the first call and appending
    along the 'time' dimension on every later call.

    Parameters
    ----------
    xarr
        xarray Dataset to write to zarr
    outputpth
        path to the zarr rootgroup folder to write
    attrs
        optional attribution; when given it replaces ``xarr.attrs`` before writing

    Returns
    -------
    str
        path to the zarr group
    """
    if attrs is not None:
        xarr.attrs = attrs

    # NOTE(review): compute=False is passed and the return value of to_zarr is discarded in both
    # branches -- confirm the array data is actually written for dask-backed datasets.
    if os.path.exists(outputpth):
        # Existing store: append under a process-level lock kept next to the store.
        lock = zarr.ProcessSynchronizer(outputpth + '.sync')
        xarr.to_zarr(outputpth, mode='a', synchronizer=lock, compute=False, append_dim='time')
        return outputpth

    # New store: 'w-' refuses to overwrite anything that appeared in the meantime.
    xarr.to_zarr(outputpth, mode='w-', compute=False)
    return outputpth
def main(args=None):
    """Write or append one NetCDF file (or a directory of them) to a zarr store.

    Usage: ``OUTPUT.zarr (INPUT.nc | INPUT.dir)``.

    * If the input is a directory, its entries are shuffled and this script
      is re-run in a subprocess once per entry.
    * If the input is a file, only the ``analysed_sst`` and
      ``analysis_error`` data variables are kept. The output store is created
      from it when it does not exist yet; otherwise every stored array that
      has a ``time`` dimension in the input is extended along that dimension.

    Parameters
    ----------
    args
        list of two strings ``[output_dir, input]``; defaults to
        ``sys.argv[1:]``.

    Exits with status 2 when the number of arguments is not exactly two.
    """
    args = args if args is not None else sys.argv[1:]
    if len(args) != 2:
        print(f'Usage: {sys.argv[0]} OUTPUT.zarr (INPUT.nc | INPUT.dir)')
        sys.exit(2)

    output_dir, input_file = args

    if os.path.isdir(input_file):
        input_dir = input_file
        input_files = os.listdir(input_dir)
        # Uniform in-place shuffle of the processing order.
        random.shuffle(input_files)
        for input_file in input_files:
            print(f'processing {input_file}')
            # One subprocess per file; its exit status is deliberately not checked.
            subprocess.run([
                sys.executable, sys.argv[0], output_dir,
                os.path.join(input_dir, input_file)
            ])
        return

    synchronizer = zarr.ProcessSynchronizer(output_dir + '.sync')
    input_ds = xr.open_dataset(input_file, decode_times=False)
    dropped_vars = set(
        input_ds.data_vars.keys()) - {"analysed_sst", "analysis_error"}
    input_ds = input_ds.drop(dropped_vars)

    if not os.path.isdir(output_dir):
        # First write: create the store, one chunk per variable.
        compressor = zarr.Blosc(cname='zstd', clevel=3, shuffle=2)
        encoding = {
            var_name: {'compressor': compressor,
                       'chunks': input_ds[var_name].shape}
            for var_name in input_ds.data_vars
        }
        input_ds.to_zarr(output_dir, encoding=encoding,
                         synchronizer=synchronizer)
        print(f'written {input_file} to {output_dir}')
    else:
        root_group = zarr.open(output_dir, mode='a',
                               synchronizer=synchronizer)
        for var_name, var_array in root_group.arrays():
            if var_name not in input_ds:
                continue
            var = input_ds[var_name]
            if 'time' not in var.dims:
                continue
            if var_name == 'time':
                print('time:', var, var.values)
            axis = var.dims.index('time')
            # Note: all append operations are forced to be sequential!
            # See https://github.com/zarr-developers/zarr/issues/75
            var_array.append(var, axis=axis)
        print(f'appended {input_file} to {output_dir}')
def __append_vars(ds, store, dim, mode='serial'):
    """Append the dimension ``dim`` and then every variable of ``ds`` to ``store``.

    Parameters
    ----------
    ds
        dataset handle passed to ``__nc_open`` and to ``__append_var``
    store
        target store; ``store[dim]`` is extended with ``dataset[dim]``
    dim
        name of the dimension along which data is appended
    mode
        ``'serial'`` runs the per-variable appends in this process,
        ``'processes'`` uses a ``ProcessPoolExecutor`` with a
        ``zarr.ProcessSynchronizer``, ``'threads'`` uses a
        ``ThreadPoolExecutor`` with a ``zarr.ThreadSynchronizer``

    Raises
    ------
    ValueError
        if ``mode`` is not one of the three values above. This is checked
        before anything is written, so an invalid mode leaves the store
        untouched.
    Exception
        in the parallel modes, the first exception raised by a worker is
        re-raised here once all workers have finished.
    """
    print("Append vars")
    if mode not in ('serial', 'processes', 'threads'):
        raise ValueError('the mode %s is not valid.' % mode)

    dataset = __nc_open(ds)
    store[dim].append(dataset[dim])
    names = list(dataset.variables.keys())

    if mode == 'serial':
        for name in names:
            __append_var(ds, store, name, dim)
        return

    if mode == 'processes':
        executor_cls = ProcessPoolExecutor
        syncro = zarr.ProcessSynchronizer(SHARED + 'ntz.sync')
    else:
        executor_cls = ThreadPoolExecutor
        syncro = zarr.ThreadSynchronizer()

    with executor_cls(max_workers=8) as executor:
        futures = [
            executor.submit(__append_var, ds, store, name, dim, syncro)
            for name in names
        ]
    # Leaving the ``with`` block waits for every task; result() surfaces
    # worker failures that would otherwise be lost.
    for future in futures:
        future.result()
def _write_new_dataset_rootgroup(self, xarr: xr.Dataset, var_name: str, dims_of_arrays: dict, chunksize: tuple,
                                 startingshp: tuple):
    """
    Create a new array in the rootgroup for one variable of the input Dataset, fill it with that
    variable's values and then resize it to startingshp (the shape expected after ALL writes).
    This must be the first write if there are multiple distributed writes.

    Parameters
    ----------
    xarr
        data to write to zarr
    var_name
        variable name
    dims_of_arrays
        where keys are array names and values list of dims/shape.
        Example: 'beampointingangle': [['time', 'sector', 'beam'], (5000, 3, 400)]
    chunksize
        chunk shape used to create the zarr array
    startingshp
        shape the rootgroup array is resized to once the data has been written
    """
    source = xarr[var_name]
    # A process-level lock is only used when the store lives at a known path.
    sync = zarr.ProcessSynchronizer(self.zarr_path + '.sync') if self.zarr_path else None

    initial_shape = dims_of_arrays[var_name][1]
    nodata = self._get_arr_nodatavalue(source.dtype)
    newarr = self.rootgroup.create_dataset(var_name, shape=initial_shape, chunks=chunksize, dtype=source.dtype,
                                           synchronizer=sync, fill_value=nodata)
    newarr[:] = source.values
    newarr.resize(startingshp)
def open(self):
    """
    Open the zarr data store in append mode (it is created when it does not exist yet) behind a
    process synchronizer located at '<zarr_path>.sync', then refresh the known array names.
    """
    lock_path = self.zarr_path + '.sync'
    self.rootgroup = zarr.open(self.zarr_path, mode='a', synchronizer=zarr.ProcessSynchronizer(lock_path))
    self.get_array_names()
def test_run_simulator_with_processes_and_zarr_directory_store():
    """
    If the store is on disk (here a Zarr DirectoryStore), collect_in_memory
    can be set to False (but synchronization needs to be employed).

    Runs 100 simulations in batches of 20 on a two-process local cluster and
    checks that every status entry ends up FINISHED and that the output
    rows are not all left at their initial value of zero.
    """
    cluster = LocalCluster(n_workers=2, processes=True, threads_per_worker=1)
    try:
        simulator = Simulator(model, sim_shapes=dict(x=(10, )), cluster=cluster)
        try:
            with tempfile.TemporaryDirectory() as tmpdir:
                pars = zarr.open(f"{tmpdir}/pars.zarr", shape=(100, 2))
                pars[:, :] = np.random.random(pars.shape)

                synchronizer = zarr.ProcessSynchronizer(path=f"{tmpdir}/x.sync")
                x = zarr.open(f"{tmpdir}/x.zarr",
                              shape=(100, 10),
                              synchronizer=synchronizer)
                x[:, :] = 0.0
                sims = dict(x=x.oindex)

                synchronizer = zarr.ProcessSynchronizer(
                    path=f"{tmpdir}/sim_status.sync")
                sim_status = zarr.open(
                    f"{tmpdir}/sim_status.zarr",
                    shape=(100, ),
                    synchronizer=synchronizer,
                    dtype="int",
                )
                sim_status[:] = np.full(100, SimulationStatus.RUNNING, dtype="int")

                # the following is non-blocking (it immediately returns)
                simulator.run(
                    pars=pars,
                    sims=sims,
                    sim_status=sim_status.oindex,
                    # builtin int: the np.int alias was removed in NumPy 1.24
                    indices=np.arange(100, dtype=int),
                    collect_in_memory=False,
                    batch_size=20,
                )

                # need to wait for tasks to be completed
                _wait_for_all_tasks()

                assert np.all(sim_status[:] == SimulationStatus.FINISHED)
                assert not np.all(np.isclose(sims["x"][:, :].sum(axis=1), 0.0))
        finally:
            # Release the client even when an assertion above fails.
            simulator.client.close()
    finally:
        cluster.close()
def reload_zarr_records(pth: str, skip_dask: bool = False, sort_by: str = None):
    """
    After writing new data to the zarr data store, you need to refresh the xarray Dataset object so that it
    sees the changes. We do that here by just re-running open_zarr.

    All the keyword arguments set to False are there to correctly read the saved zarr arrays. mask_and_scale
    in particular is disabled because it replaces values equal to the fill_value attribute with NaN (and was
    observed to replace zeros with NaN even for a non-zero fill_value).
    You can read more here: http://xarray.pydata.org/en/stable/generated/xarray.open_zarr.html

    Parameters
    ----------
    pth
        string, path to xarray Dataset stored as zarr datastore
    skip_dask
        if True, open without the process synchronizer (for use outside of the dask distributed workflow,
        e.g. when only opening the store briefly to read attributes)
    sort_by
        optional, will sort by the dimension provided, if provided (ex: 'time')

    Returns
    -------
    xr.Dataset or None
        the reloaded Dataset (sorted when sort_by is given), or None when pth does not exist
    """
    if not os.path.exists(pth):
        print('Unable to reload, no paths found: {}'.format(pth))
        return None

    # Only build the lock object when it is actually going to be used.
    sync = None if skip_dask else zarr.ProcessSynchronizer(pth + '.sync')
    data = xr.open_zarr(pth, synchronizer=sync, consolidated=False, mask_and_scale=False, decode_coords=False,
                        decode_times=False, decode_cf=False, concat_characters=False)
    return data.sortby(sort_by) if sort_by else data
def to_zarr(filename, data, window, chunks, root=None):
    """
    Writes one window of data to a zarr store as its own group

    The store path is ``<parent of filename>/sub_tmp_/data.zarr``. The group
    name is built from the base name of ``filename`` and the window offsets
    and size; the window geometry is also stored in the group attributes.

    Args:
        filename (str): The output file name.
        data (ndarray): The data to write.
        window (namedtuple): A ``rasterio.window.Window`` object.
        chunks (int or tuple): The ``zarr`` chunks.
        root (Optional[object]): The ``zarr`` root. When ``None`` the store
            at the path above is opened with mode ``'r+'``, so it must
            already exist.

    Returns:
        ``str``: The path of the zarr store.
    """
    p = Path(filename)

    f_base = p.name.split('.')[0]
    sub_dir = p.parent.joinpath('sub_tmp_')
    zarr_file = sub_dir.joinpath('data.zarr').as_posix()

    # Compare against None explicitly: an empty zarr group has length 0 and
    # is therefore falsy, so ``not root`` would discard a valid root that
    # simply has no members yet.
    if root is None:
        root = zarr.open(zarr_file, mode='r+')

    group_name = '{BASE}_y{Y:09d}_x{X:09d}_h{H:09d}_w{W:09d}'.format(BASE=f_base,
                                                                     Y=window.row_off,
                                                                     X=window.col_off,
                                                                     H=window.height,
                                                                     W=window.width)

    group = root.create_group(group_name)

    # NOTE(review): 'data.sync' is relative to the current working directory,
    # not to the store -- confirm all writers share the same cwd.
    synchronizer = zarr.ProcessSynchronizer('data.sync')

    group.array('data',
                data,
                compressor=compressor,
                dtype=data.dtype.name,
                chunks=chunks,
                synchronizer=synchronizer)

    group.attrs['row_off'] = window.row_off
    group.attrs['col_off'] = window.col_off
    group.attrs['height'] = window.height
    group.attrs['width'] = window.width

    return zarr_file
def __init__(
    self,
    params: Union[int, list],
    zarr_store: Union[zarr.MemoryStore, zarr.DirectoryStore],
    simulator=None,
    sync_path: Optional[PathType] = None,
):
    """Initialize Store content dimensions.

    Args:
        params (list of strings or int): List of paramater names. If int use ['z0', 'z1', ...].
        zarr_store: zarr storage.
        simulator: object whose ``sim_shapes`` is read when a new store is set up.
        sync_path: path to the cache lock files. Must be accessible to all
            processes working on the cache.

    Raises:
        KeyError: if the store is neither empty nor made of exactly the
            'samples' and 'metadata' keys.
    """
    self._zarr_store = zarr_store
    self._simulator = simulator

    if isinstance(params, int):
        params = ["z%i" % i for i in range(params)]
    self.params = params

    synchronizer = None
    if sync_path:
        synchronizer = zarr.ProcessSynchronizer(sync_path)
    self._root = zarr.group(store=self.zarr_store, synchronizer=synchronizer)

    logging.debug(" params = %s" % str(params))

    stored_keys = set(self._root.keys())
    if stored_keys == {"samples", "metadata"}:
        logging.info("Loading existing store.")
        self._update()
    elif not stored_keys:
        logging.info("Creating new store.")
        self._setup_new_zarr_store(
            len(self.params), simulator.sim_shapes, self._root
        )
        logging.debug(" sim_shapes = %s" % str(simulator.sim_shapes))
    else:
        raise KeyError(
            "The zarr storage is corrupted. It should either be empty or only have the keys ['samples', 'metadata']."
        )

    self._lock = None
    if sync_path is not None:
        self._setup_lock(sync_path)
def __set_dims(ds, group, mode):
    """Call ``__set_dim`` for every variable of ``ds``.

    Parameters
    ----------
    ds
        dataset handle passed to ``__nc_open`` and to ``__set_dim``
    group
        target group passed through to ``__set_dim``
    mode
        ``'serial'`` runs in this process, ``'processes'`` uses a
        ``ProcessPoolExecutor`` with a ``zarr.ProcessSynchronizer``,
        ``'threads'`` uses a ``ThreadPoolExecutor`` with a
        ``zarr.ThreadSynchronizer``

    Raises
    ------
    ValueError
        if ``mode`` is not one of the three values above (checked before
        the dataset is opened).
    Exception
        in the parallel modes, the first exception raised by a worker is
        re-raised here once all workers have finished.
    """
    if mode not in ('serial', 'processes', 'threads'):
        raise ValueError('the mode %s is not valid.' % mode)

    dataset = __nc_open(ds)
    names = list(dataset.variables.keys())

    if mode == 'serial':
        for name in names:
            __set_dim(ds, group, name)
        return

    if mode == 'processes':
        executor_cls = ProcessPoolExecutor
        syncro = zarr.ProcessSynchronizer(SHARED + 'ntz.sync')
    else:
        executor_cls = ThreadPoolExecutor
        syncro = zarr.ThreadSynchronizer()

    with executor_cls(max_workers=8) as executor:
        futures = [
            executor.submit(__set_dim, ds, group, name, syncro)
            for name in names
        ]
    # Leaving the ``with`` block waits for every task; result() surfaces
    # worker failures that would otherwise be lost.
    for future in futures:
        future.result()
def my_xarr_add_attribute(attrs: dict, outputpth: str):
    """
    Open the zarr group at outputpth (creating it if needed) behind a process synchronizer and hand
    it, together with attrs, to the attribute writer.

    Parameters
    ----------
    attrs
        dictionary of combined attributes from xarray datasets, None if no attributes exist
    outputpth
        path to zarr group to either be created or append to

    Returns
    -------
    str
        path to the final zarr group
    """
    lock = zarr.ProcessSynchronizer(outputpth + '.sync')
    # mode 'a' means read/write, create if doesnt exist
    _my_xarr_to_zarr_writeattributes(zarr.open(outputpth, mode='a', synchronizer=lock), attrs)
    return outputpth
def select_agents(
    zarr_dataset: ChunkedDataset,
    th_agent_prob: float,
    th_yaw_degree: float,
    th_extent_ratio: float,
    th_distance_av: float,
) -> None:
    """
    Filter agents of a zarr dataset according to multiple thresholds and store
    the result next to the dataset.

    The result is a ``uint32`` array of shape ``(len(zarr_dataset.agents), 2)``
    written to ``<dataset path>/agents_mask/<th_agent_prob>``. In the printed
    report column 0 is compared against the past steps and column 1 against
    the future steps.

    Args:
        zarr_dataset: dataset to process; ``path``, ``scenes`` and ``agents`` are read.
        th_agent_prob: forwarded to ``get_valid_agents`` as
            ``th_agent_filter_probability_threshold``; also names the output array.
        th_yaw_degree: forwarded to ``get_valid_agents``.
        th_extent_ratio: forwarded to ``get_valid_agents``.
        th_distance_av: forwarded to ``get_valid_agents``.

    Raises:
        FileExistsError: if a mask for ``th_agent_prob`` already exists.
    """
    agents_mask_path = Path(zarr_dataset.path) / f"agents_mask/{th_agent_prob}"
    if agents_mask_path.exists():
        raise FileExistsError(
            f"{th_agent_prob} exists already! only one is supported!")

    frame_index_intervals = zarr_dataset.scenes["frame_index_interval"]

    # build a partial with all args except the first one (will be passed by threads)
    get_valid_agents_partial = partial(
        get_valid_agents,
        dataset=zarr_dataset,
        th_agent_filter_probability_threshold=th_agent_prob,
        th_yaw_degree=th_yaw_degree,
        th_extent_ratio=th_extent_ratio,
        th_distance_av=th_distance_av,
    )

    # require_group creates the group only when it is missing, so genuine
    # errors from opening the store are no longer swallowed.
    zarr.open(zarr_dataset.path, mode="a").require_group("agents_mask")

    agents_mask = zarr.open_array(
        str(agents_mask_path),
        mode="w",
        shape=(len(zarr_dataset.agents), 2),
        chunks=(10000, ),
        dtype=np.uint32,
        synchronizer=zarr.ProcessSynchronizer(
            Path(gettempdir()) / f"ag_mask_{str(uuid4())}.sync"),
    )

    report: Counter = Counter()
    print("starting pool...")
    with Pool(cpu_count()) as pool:
        tasks = tqdm(
            enumerate(
                pool.imap_unordered(get_valid_agents_partial,
                                    frame_index_intervals)),
            total=len(frame_index_intervals),
        )
        for idx, (mask, count, agents_range) in tasks:
            report += count
            agents_mask[agents_range[0]:agents_range[1]] = mask
            tasks.set_description(f"{idx + 1}/{len(frame_index_intervals)}")
        print("collecting results..")

    agents_cfg = {
        "th_agent_filter_probability_threshold": th_agent_prob,
        "th_yaw_degree": th_yaw_degree,
        "th_extent_ratio": th_extent_ratio,
        "th_distance_av": th_distance_av,
    }
    # print report
    pp = pprint.PrettyPrinter(indent=4)
    print(f"start report for {zarr_dataset.path}")
    pp.pprint({**agents_cfg, **report})

    future_steps = [0, 10, 30, 50]
    past_steps = [0, 10, 30, 50]
    # Load the whole mask once; the table below only needs in-memory comparisons.
    agents_mask_np = np.asarray(agents_mask)

    table = PrettyTable(field_names=["past/future"] +
                        [str(step) for step in future_steps])
    for step_p in tqdm(past_steps, desc="computing past/future table"):
        past_mask = agents_mask_np[:, 0] >= step_p
        row = [step_p]
        for step_f in future_steps:
            future_mask = agents_mask_np[:, 1] >= step_f
            row.append(np.sum(past_mask & future_mask))
        table.add_row(row)
    print(table)
    print(f"end report for {zarr_dataset.path}")
    print("==============================")
# Logging setup: INFO level on the root logger, timestamped messages.
logging_format = '%(asctime)s - %(name)s - %(message)s'
logging.root.setLevel(logging.INFO)
logging.basicConfig(level=logging.INFO, format=logging_format, datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger("ProcessData")

# Base directories of the raw radar and satellite data.
# radarBase = "/wave/mlp/cwb-ci/Radar/raw/"
# satBase = "/wave/mlp/cwb-ci/Satellite/raw/"
radarBase = "../../data/cwb-ci/Radar/raw/"
satBase = "../../data/cwb-ci/Satellite/raw/"

# Module-level zarr compressor and process synchronizer.
# NOTE(review): 'example.sync' is a path relative to the working directory -- confirm this is intended.
global compressor
compressor = zarr.Blosc(cname='zstd', clevel=3, shuffle=2)
synchronizer = zarr.ProcessSynchronizer('example.sync')


def bilinear_resize(image, height, width):
    """
    `image` is a 2-D numpy array
    `height` and `width` are the desired spatial dimension of the new 2-D array.
    """
    img_height, img_width = image.shape

    # Work on the flattened image.
    image = image.ravel()

    # Source/target scale along each axis; 0 when the target has a single pixel on that axis.
    x_ratio = float(img_width - 1) / (width - 1) if width > 1 else 0
    y_ratio = float(img_height - 1) / (height - 1) if height > 1 else 0

    # Row (y) and column (x) index of every pixel of the target grid, in row-major order.
    y, x = np.divmod(np.arange(height * width), width)
def __init__(
    self,
    params: Union[int, list],
    zarr_store: Union[zarr.MemoryStore, zarr.DirectoryStore],
    simulator: Optional[Simulator] = None,
    sync_path: Optional[PathType] = None,
    chunksize: int = 1000,
):
    """Initialize Store content dimensions.

    Args:
        params (list of strings or int): List of paramater names. If int use ['z0', 'z1', ...].
        zarr_store: zarr storage.
        simulator: simulator object.
        sync_path: if specified, it will enable synchronization using file locks (files will be
            stored in the given path). Must be accessible to all processes working on the store
            and the underlying filesystem must support file locking.
        chunksize: the parameters and simulation output will be stored as arrays with the
            specified chunk size along the sample dimension (a single chunk will be used for
            the other dimensions).

    Raises:
        KeyError: if the store is neither empty nor made of exactly the
            'samples' and 'metadata' keys.
    """
    self._zarr_store = zarr_store
    self._simulator = simulator

    if isinstance(params, int):
        params = ["z%i" % i for i in range(params)]
    self.params = params

    synchronizer = None
    if sync_path:
        synchronizer = zarr.ProcessSynchronizer(sync_path)
    self._root = zarr.group(store=self.zarr_store,
                            synchronizer=synchronizer)

    log.debug(" params = %s" % str(params))

    stored_keys = set(self._root.keys())
    # TODO: Remove the print calls below
    if stored_keys == {"samples", "metadata"}:
        print("Loading existing store.")
        self._update()
    elif not stored_keys:
        print("Creating new store.")
        self._setup_new_zarr_store(len(self.params),
                                   simulator.sim_shapes,
                                   self._root,
                                   chunksize=chunksize)
        log.debug(" sim_shapes = %s" % str(simulator.sim_shapes))
    else:
        raise KeyError(
            "The zarr storage is corrupted. It should either be empty or only have the keys ['samples', 'metadata']."
        )

    # a second layer of synchronization is required to grow the store
    self._lock = None
    if sync_path is not None:
        self._setup_lock(sync_path)
def select_agents(
    input_folder: str,
    th_agent_prob: float,
    th_history_num_frames: int,
    th_future_num_frames: int,
    th_yaw_degree: float,
    th_extent_ratio: float,
    th_movement: float,
    th_distance_av: float,
    num_workers: int,
) -> None:
    """
    Filter agents from zarr INPUT_FOLDER according to multiple thresholds and
    store a boolean array of the same shape.

    The mask has one entry per agent and is written to
    ``<dataset>/agents_mask/<history>_<future>_<prob>``.

    Args:
        input_folder: dataset key resolved through ``LocalDataManager.require``.
        th_agent_prob: forwarded to ``get_valid_agents`` as
            ``th_agent_filter_probability_threshold``.
        th_history_num_frames: forwarded as ``th_frames_past``.
        th_future_num_frames: forwarded as ``th_frames_future``; must be > 0.
        th_yaw_degree: forwarded to ``get_valid_agents``.
        th_extent_ratio: forwarded to ``get_valid_agents``.
        th_movement: forwarded to ``get_valid_agents``.
        th_distance_av: forwarded to ``get_valid_agents``.
        num_workers: size of the multiprocessing pool.

    Raises:
        AssertionError: if ``th_future_num_frames`` is not positive, or if the
            collected report counters are inconsistent.
        FileExistsError: if the output group already exists.
    """
    # Local import: only this function needs the platform temp directory.
    from tempfile import gettempdir

    assert th_future_num_frames > 0

    # ===== LOAD
    dm = LocalDataManager()
    input_folder = dm.require(input_folder)

    zarr_dataset = ChunkedStateDataset(path=input_folder)
    zarr_dataset.open()

    output_group = f"{th_history_num_frames}_{th_future_num_frames}_{th_agent_prob}"
    if "agents_mask" in zarr_dataset.root and f"agents_mask/{output_group}" in zarr_dataset.root:
        raise FileExistsError(
            f"{output_group} exists already! only one is supported for now!")

    frame_index_intervals = zarr_dataset.scenes["frame_index_interval"]

    # build a partial with all args except the first one (will be passed by threads)
    get_valid_agents_partial = partial(
        get_valid_agents,
        dataset=zarr_dataset,
        th_frames_past=th_history_num_frames,
        th_frames_future=th_future_num_frames,
        th_agent_filter_probability_threshold=th_agent_prob,
        th_yaw_degree=th_yaw_degree,
        th_extent_ratio=th_extent_ratio,
        th_movement=th_movement,
        th_distance_av=th_distance_av,
    )

    # require_group creates the group only when it is missing, so genuine
    # errors from opening the store are no longer swallowed.
    zarr.open(zarr_dataset.path, mode="a").require_group("agents_mask")

    agents_mask = zarr.open_array(
        str(Path(zarr_dataset.path) / "agents_mask" / output_group),
        mode="w",
        shape=(len(zarr_dataset.agents), ),
        chunks=(10000, ),
        # builtin bool: the np.bool alias is missing from NumPy 1.24-1.26
        dtype=bool,
        # lock files go to the platform temp directory instead of a hard-coded /tmp
        synchronizer=zarr.ProcessSynchronizer(
            str(Path(gettempdir()) / f"ag_mask_{str(uuid4())}.sync")),
    )

    report: Counter = Counter()
    print("starting pool...")
    with Pool(num_workers) as pool:
        tasks = tqdm(
            enumerate(
                pool.imap_unordered(get_valid_agents_partial,
                                    frame_index_intervals)),
            total=len(frame_index_intervals),
        )
        for idx, (mask, count, agents_range) in tasks:
            report += count
            agents_mask[agents_range[0]:agents_range[1]] = mask
            tasks.set_description(f"{idx + 1}/{len(frame_index_intervals)}")
        print("collecting results..")

    assert (report["total_agent_frames"] == report["selected_agent_frames"] +
            report["total_reject"]), "something went REALLY wrong"

    agents_cfg = {
        "th_history_num_frames": th_history_num_frames,
        "th_future_num_frames": th_future_num_frames,
        "th_agent_filter_probability_threshold": th_agent_prob,
        "th_yaw_degree": th_yaw_degree,
        "th_extent_ratio": th_extent_ratio,
        "th_movement": th_movement,
        "th_distance_av": th_distance_av,
    }
    # print report
    pp = pprint.PrettyPrinter(indent=4)
    print(f"start report for {input_folder}")
    pp.pprint({**agents_cfg, **report})
    print(f"end report for {input_folder}")
    print("==============================")
# Process the input files in batches of `incr` files and append each batch to one zarr store
# along the 'N_PROF' dimension.
incr = 1000
stop = len(files)
ranges = list(range(start, stop, incr))
for i in tqdm_notebook(ranges):
    print(f'Processing {i}')
    # Collect one task per file of the current batch, then evaluate them together.
    d = []
    for file in files[i:i + incr]:
        print(file)
        d.append(process_float(file))
    results = dask.compute(*d)

    # Merge the per-file results into a single dataset and rechunk it before writing.
    t = xr.concat(results, dim='N_PROF', coords='minimal')
    t = t.chunk({'N_PROF': 10000, 'N_LEVELS': 3000})
    print(f'Finished concatenating dataset')

    # Blosc's own threading is switched off before writing.
    # NOTE(review): presumably because the write runs across several processes -- confirm.
    numcodecs.blosc.use_threads = False
    synchronizer = zarr.ProcessSynchronizer('../../argozarr/argodask2.sync')
    #compressor = zarr.Blosc(cname='zstd', clevel=3, shuffle=2)
    zarr_path = '../../argozarr/argo_dask2.zarr'
    #encoding = {vname: {'compressor': compressor} for vname in t.variables}
    d = t.to_zarr(zarr_path, mode='a', synchronizer=synchronizer, compute=True, append_dim='N_PROF')
    print('Appending Done!')
    # Restart the dask client between batches.
    client.restart()
cluster.close()