def upload(self, results, ds: Dataset, token: dict, progressbar: bool = True): """Batchified upload of results For each tensor batchify based on its chunk and upload If tensor is dynamic then still upload element by element For dynamic tensors, it disable dynamicness and then enables it back Parameters ---------- dataset: hub.Dataset Dataset object that should be written to results: Output of transform function progressbar: bool Returns ---------- ds: hub.Dataset Uploaded dataset """ for key, value in results.items(): length = ds[key].chunksize[0] value = str_to_int(value, ds.dataset.tokenizer) if length == 0: length = 1 batched_values = batchify(value, length) def upload_chunk(i_batch): i, batch = i_batch batch_length = len(batch) if batch_length != 1: ds[key, i * length:i * length + batch_length] = batch else: ds[key, i * length] = batch[0] index_batched_values = list( zip(list(range(len(batched_values))), batched_values)) # Disable dynamic arrays ds.dataset._tensors[f"/{key}"].disable_dynamicness() list(self.map(upload_chunk, index_batched_values)) # Enable and rewrite shapes if ds.dataset._tensors[f"/{key}"].is_dynamic: ds.dataset._tensors[f"/{key}"].enable_dynamicness() [ ds.dataset._tensors[f"/{key}"].set_shape([i + ds.offset], v) for i, v in enumerate(value) ] ds.commit() return ds
def upload(self, results, ds: Dataset, token: dict, progressbar: bool = True): """Batchified upload of results. For each tensor batchify based on its chunk and upload. If tensor is dynamic then still upload element by element. For dynamic tensors, it disable dynamicness and then enables it back. Parameters ---------- dataset: hub.Dataset Dataset object that should be written to results: Output of transform function progressbar: bool Returns ---------- ds: hub.Dataset Uploaded dataset """ for key, value in results.items(): chunk = ds[key].chunksize[0] chunk = 1 if chunk == 0 else chunk value = get_value(value) value = str_to_int(value, ds.dataset.tokenizer) num_chunks = math.ceil(len(value) / (chunk * self.workers)) length = num_chunks * chunk if self.workers != 1 else len(value) batched_values = batchify(value, length) def upload_chunk(i_batch): i, batch = i_batch length = len(batch) slice_ = slice(i * length, (i + 1) * length) ds[key, slice_] = batch index_batched_values = list( zip(list(range(len(batched_values))), batched_values)) # Disable dynamic arrays ds.dataset._tensors[f"/{key}"].disable_dynamicness() list(self.map(upload_chunk, index_batched_values)) offset = ds.indexes[ 0] # here ds.indexes will always be a contiguous list as obtained after slicing # Enable and rewrite shapes if ds.dataset._tensors[f"/{key}"].is_dynamic: ds.dataset._tensors[f"/{key}"].enable_dynamicness() ds.dataset._tensors[f"/{key}"].set_shape( [slice(offset, offset + len(value))], value) ds.commit() return ds
def load(tag):
    """Load a dataset from the repository using the given tag.

    Args:
        tag: string in {username}/{dataset} format, a filesystem path, or an s3:// / gcs:// url

    Notes
    -----
    It first tries to load the dataset using the old v0.x format and falls back to the newer version.
    """
    try:
        ds = load_v0(tag)
        logger.warning(
            "Deprecation warning: the given dataset uses the deprecated v0.x format. "
            "Please convert it to the v1.x format upon availability."
        )
        return ds
    except ImportError:
        raise DaskModuleNotInstalledException
    except HubDatasetNotFoundException:
        raise
    except Exception as e:
        pass
        # logger.warning(traceback.format_exc() + str(e))
    return Dataset(tag)
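# Minimal usage sketch for load(); the tag, bucket and path below are
# hypothetical. All three forms go through the same v0-first fallback above.
ds = load("username/my_dataset")             # registry tag
ds_s3 = load("s3://my-bucket/my_dataset")    # cloud storage url
ds_local = load("./data/my_dataset")         # local filesystem path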
def store_shard(self, ds_in: Iterable, ds_out: Dataset, offset: int, token=None):
    """Takes a shard of the iterable ds_in, computes the transform and stores the results in the DatasetView ds_out."""

    def _func_argd(item):
        if isinstance(item, (DatasetView, Dataset)):
            item = item.numpy()
        # if the item obtained by iterating over ds_in is a list, it is not treated as a list
        result = self.call_func(0, item)
        return result

    ds_in = list(ds_in)
    results = self.map(
        _func_argd,
        ds_in,
    )
    results = self._unwrap(results)
    results = self.map(lambda x: self._flatten_dict(x, schema=self.schema), results)
    results = list(results)
    results = self._split_list_to_dicts(results)

    results_values = list(results.values())
    if len(results_values) == 0:
        return 0

    n_results = len(results_values[0])
    if n_results == 0:
        return 0

    additional = max(offset + n_results - ds_out.shape[0], 0)
    ds_out.append_shape(additional)

    self.upload(
        results,
        ds_out[offset : offset + n_results],
        token=token,
    )
    return n_results
def store_shard(self, ds_in: Iterable, ds_out: Dataset, offset: int, token=None):
    """Takes a shard of the iterable ds_in, computes the transform and stores the results in the DatasetView ds_out."""

    def _func_argd(item):
        return self._func(item, **self.kwargs)

    ds_in = list(ds_in)
    results = self.map(
        _func_argd,
        ds_in,
    )
    results = self._unwrap(results)
    results = self.map(lambda x: self._flatten_dict(x, schema=self.schema), results)
    results = list(results)
    results = self._split_list_to_dicts(results)

    results_values = list(results.values())
    if len(results_values) == 0:
        return 0

    n_results = len(results_values[0])
    if n_results == 0:
        return 0

    additional = max(offset + n_results - ds_out.shape[0], 0)
    ds_out.append_shape(additional)

    self.upload(
        results,
        ds_out[offset : offset + n_results],
        token=token,
    )
    return n_results
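# The growth arithmetic at the end of store_shard() can be checked in
# isolation; this is a sketch with made-up numbers, not part of the module.
def extra_rows_needed(offset, n_results, current_len):
    # rows to append so that offset + n_results fits inside the output dataset
    return max(offset + n_results - current_len, 0)

assert extra_rows_needed(offset=0, n_results=10, current_len=10) == 0    # fits exactly
assert extra_rows_needed(offset=8, n_results=10, current_len=10) == 8    # grow by 8
assert extra_rows_needed(offset=20, n_results=5, current_len=10) == 15   # shard starts past the end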
def create_dataset(self, url, length=None, token=None):
    """Helper function to create a dataset."""
    shape = (length,)
    ds = Dataset(
        url,
        mode="w",
        shape=shape,
        schema=self.schema,
        token=token,
        fs=zarr.storage.MemoryStore() if "tmp" in url else None,
        cache=False,
    )
    return ds
def create_dataset(self, url: str, length: int = None, token: dict = None, public: bool = True):
    """Helper function to create a dataset."""
    shape = (length,)
    ds = Dataset(
        url,
        mode="w",
        shape=shape,
        schema=self.schema,
        token=token,
        fs=zarr.storage.MemoryStore() if "tmp" in url else None,
        cache=False,
        public=public,
    )
    return ds
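# Sketch of how create_dataset() routes storage, assuming a transform instance
# named `t` and hypothetical urls: urls containing "tmp" are kept in an
# in-memory zarr store, everything else is persisted to the url's own backend.
tmp_ds = t.create_dataset("./tmp/intermediate", length=100)             # zarr MemoryStore
out_ds = t.create_dataset("./data/output", length=100, public=False)    # persistent backend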
def test_logs():
    logs = Dataset(
        schema={
            "train_acc": float,
            "train_loss": float,
            "val_acc": float,
            "val_loss": float,
        },
        shape=(1,),
        url="./data/test/models/logs",
        mode="w",
    )
    metrics_1 = {
        "val_loss": 1.21,
        "val_acc": 0.5,
        "train_loss": 2.4,
        "train_acc": 0.75,
    }
    for key, value in metrics_1.items():
        logs[key] = value
    assert np.isclose(logs["val_loss"].numpy(), 1.21)
    assert np.isclose(logs["train_loss"].numpy(), 2.4)