# Assumed imports for the snippets in this section (hub 1.x helpers; the
# exact module paths are an assumption and may differ):
#   import math
#   import ray
#   from hub import Dataset
#   from hub.utils import batchify
#   from hub.api.dataset_utils import get_value, str_to_int

def upload(self, results, ds: Dataset, token: dict, progressbar: bool = True):
    """Batchified upload of results.

    For each tensor, batchify based on its chunk size and upload.
    For dynamic tensors, dynamicness is disabled during the upload and
    enabled back afterwards, and the shapes are rewritten element by element.

    Parameters
    ----------
    results
        Output of the transform function
    ds: hub.Dataset
        Dataset object that should be written to
    token: dict
        Credentials used to access the dataset
    progressbar: bool

    Returns
    ----------
    ds: hub.Dataset
        Uploaded dataset
    """
    for key, value in results.items():
        length = ds[key].chunksize[0]
        value = str_to_int(value, ds.dataset.tokenizer)

        if length == 0:
            length = 1
        batched_values = batchify(value, length)

        def upload_chunk(i_batch):
            i, batch = i_batch
            batch_length = len(batch)
            if batch_length != 1:
                ds[key, i * length : i * length + batch_length] = batch
            else:
                ds[key, i * length] = batch[0]

        index_batched_values = list(
            zip(list(range(len(batched_values))), batched_values)
        )

        # Disable dynamic arrays
        ds.dataset._tensors[f"/{key}"].disable_dynamicness()
        list(self.map(upload_chunk, index_batched_values))

        # Enable and rewrite shapes
        if ds.dataset._tensors[f"/{key}"].is_dynamic:
            ds.dataset._tensors[f"/{key}"].enable_dynamicness()
            for i, v in enumerate(value):
                ds.dataset._tensors[f"/{key}"].set_shape([i + ds.offset], v)

    ds.commit()
    return ds
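# `batchify` is used by every version of `upload` in this section but is not
# defined here. A minimal sketch of its assumed behaviour: split a sequence
# into consecutive batches of at most `n` elements, where only the last batch
# may be shorter.
def batchify(seq, n):
    """Split `seq` into consecutive batches of size at most `n`."""
    return [seq[i : i + n] for i in range(0, len(seq), n)]

# e.g. batchify([1, 2, 3, 4, 5], 2) == [[1, 2], [3, 4], [5]]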
def upload(self, results, ds: Dataset, token: dict, progressbar: bool = True):
    """Batchified upload of results.

    For each tensor, batchify based on its chunk size and upload.
    For dynamic tensors, dynamicness is disabled during the upload and
    enabled back afterwards, and the shapes are rewritten in one call
    over the uploaded slice.

    Parameters
    ----------
    results
        Output of the transform function
    ds: hub.Dataset
        Dataset object that should be written to
    token: dict
        Credentials used to access the dataset
    progressbar: bool

    Returns
    ----------
    ds: hub.Dataset
        Uploaded dataset
    """
    for key, value in results.items():
        chunk = ds[key].chunksize[0]
        chunk = 1 if chunk == 0 else chunk
        value = get_value(value)
        value = str_to_int(value, ds.dataset.tokenizer)

        # One batch per worker, each aligned to a multiple of the chunk size
        num_chunks = math.ceil(len(value) / (chunk * self.workers))
        length = num_chunks * chunk if self.workers != 1 else len(value)
        batched_values = batchify(value, length)

        def upload_chunk(i_batch):
            i, batch = i_batch
            batch_length = len(batch)
            # the start offset uses the outer batch size `length`, since the
            # last batch may be shorter than the others
            ds[key, i * length : i * length + batch_length] = batch

        index_batched_values = list(
            zip(list(range(len(batched_values))), batched_values)
        )

        # Disable dynamic arrays
        ds.dataset._tensors[f"/{key}"].disable_dynamicness()
        list(self.map(upload_chunk, index_batched_values))

        # ds.indexes is always a contiguous list here, as obtained after slicing
        offset = ds.indexes[0]

        # Enable and rewrite shapes
        if ds.dataset._tensors[f"/{key}"].is_dynamic:
            ds.dataset._tensors[f"/{key}"].enable_dynamicness()
            ds.dataset._tensors[f"/{key}"].set_shape(
                [slice(offset, offset + len(value))], value
            )

    ds.commit()
    return ds
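# A worked example of the batching arithmetic in the version above, with
# assumed values: 10 elements, chunk size 3, and 2 workers.
import math

chunk, workers, n = 3, 2, 10
num_chunks = math.ceil(n / (chunk * workers))  # ceil(10 / 6) == 2
length = num_chunks * chunk                    # 2 * 3 == 6
# batchify would then produce batches of sizes 6 and 4: roughly one batch
# per worker, each starting on a chunk boundary.
assert (length, n - length) == (6, 4)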
def upload(self, results, url: str, token: dict, progressbar: bool = True):
    """Batchified upload of results.

    For each tensor, batchify based on its chunk size and upload the
    batches as ray tasks. If the tensor is dynamic, it is still uploaded
    element by element.

    Parameters
    ----------
    results
        Output of the transform function
    url: str
        Path to the dataset that should be written to
    token: dict
        Credentials used to access the dataset
    progressbar: bool

    Returns
    ----------
    ds: hub.Dataset
        Uploaded dataset
    """
    shape = (len(list(results.values())[0]),)
    ds = Dataset(
        url,
        mode="w",
        shape=shape,  # full shape unknown upfront
        schema=self.schema,
        token=token,
        cache=False,
    )

    tasks = []
    for key, value in results.items():
        length = ds[key].chunksize[0]
        batched_values = batchify(value, length)
        chunk_id = list(range(len(batched_values)))
        index_batched_values = list(zip(chunk_id, batched_values))
        # `chunk_results` avoids shadowing the `results` argument mid-loop
        chunk_results = [
            self.upload_chunk.remote(el, key=key, ds=ds)
            for el in index_batched_values
        ]
        tasks.extend(chunk_results)

    ray.get(tasks)
    ds.commit()
    return ds
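# The ray versions call `self.upload_chunk.remote(...)`, which is not shown
# in this section. A minimal sketch, written here as a free remote function
# (in the original it is presumably a remote method on the transform class):
import ray

@ray.remote
def upload_chunk(i_batch, key, ds):
    """Write one batch of values into `ds[key]` at its chunk-aligned offset."""
    i, batch = i_batch
    length = ds[key].chunksize[0]  # batches were created with this size
    ds[key, i * length : i * length + len(batch)] = batch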
def upload(
    self,
    results,
    url: str,
    token: dict,
    progressbar: bool = True,
    public: bool = True,
):
    """Batchified upload of results.

    For each tensor, batchify based on its chunk size and upload the
    batches as ray tasks. Dynamicness is disabled during the upload and
    the dynamic shapes are written back afterwards.

    Parameters
    ----------
    results
        Output of the transform function
    url: str
        Path to the dataset that should be written to
    token: dict
        Credentials used to access the dataset
    progressbar: bool
    public: bool, optional
        Only applicable if using hub storage; ignored otherwise. Setting
        this to False allows only the user who created the dataset to
        access it, and the dataset won't be visible to the public in the
        visualizer.

    Returns
    ----------
    ds: hub.Dataset
        Uploaded dataset
    """
    if len(list(results.values())) == 0:
        shape = (0,)
    else:
        shape = (len(list(results.values())[0]),)

    ds = Dataset(
        url,
        mode="w",
        shape=shape,
        schema=self.schema,
        token=token,
        cache=False,
        public=public,
    )

    tasks = []
    for key, value in results.items():
        length = ds[key].chunksize[0]
        value = get_value(value)
        value = str_to_int(value, ds.tokenizer)
        batched_values = batchify(value, length)
        chunk_id = list(range(len(batched_values)))
        index_batched_values = list(zip(chunk_id, batched_values))

        ds._tensors[f"/{key}"].disable_dynamicness()
        # `chunk_results` avoids shadowing the `results` argument mid-loop
        chunk_results = [
            self.upload_chunk.remote(el, key=key, ds=ds)
            for el in index_batched_values
        ]
        tasks.extend(chunk_results)

    results = ray.get(tasks)
    self.set_dynamic_shapes(results, ds)
    ds.commit()
    return ds
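# `set_dynamic_shapes` is referenced above but not defined in this section.
# A speculative sketch, assuming each `upload_chunk` task returns a
# (key, start_index, shapes) tuple for its dynamic tensor (the actual return
# format is not shown here); the shape write mirrors the non-ray version:
def set_dynamic_shapes(self, results, ds):
    """Re-enable dynamicness and write back the per-element shapes."""
    for key, start, shapes in results:
        tensor = ds._tensors[f"/{key}"]
        if tensor.is_dynamic:
            tensor.enable_dynamicness()
            tensor.set_shape([slice(start, start + len(shapes))], shapes)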