def test_dataset_utils():
    """Exercise the slicing helpers with inputs they must reject and inputs they must accept."""
    # A float is not an acceptable component of an index.
    with pytest.raises(TypeError):
        slice_split([5.3])

    # (expected exception, index, number of samples)
    rejected = (
        (IndexError, 5, 3),
        (ValueError, slice(2, 10, -2), 3),
        (IndexError, slice(20, 100), 3),
        (IndexError, slice(1, 20), 3),
        (IndexError, slice(4, 1), 10),
    )
    for expected_error, index, num_samples in rejected:
        with pytest.raises(expected_error):
            slice_extract_info(index, num_samples)

    # Open-ended slices only have to run without raising.
    accepted = (
        (slice(None, 10), 20),
        (slice(20, None), 50),
    )
    for index, num_samples in accepted:
        slice_extract_info(index, num_samples)
def __setitem__(self, slice_, value) -> None:
    """Assign ``value`` to the samples selected by ``slice_`` across the underlying datasets.

    An integer first index is routed (through ``self.slicing``) to the single
    dataset holding that sample.  A slice first index is cut at dataset
    boundaries and the matching part of ``value`` is written to each dataset
    in turn.

    Parameters
    ----------
    slice_: int, slice, str or an iterable of those
        Sample selection, optionally combined with a key and further slices.
    value:
        Data to store.  For a slice selection it is indexed along its first
        axis, relative to the (normalized) start of the selection.
    """
    if not isinstance(slice_, Iterable) or isinstance(slice_, str):
        slice_ = [slice_]
    slice_ = list(slice_)
    subpath, slice_list = slice_split(slice_)
    slice_list = slice_list or [slice(0, self.num_samples)]

    if isinstance(slice_list[0], int):
        # if integer it assigns the data to the corresponding dataset
        slice_list, shard_id = self.slicing(slice_list)
        slice_ = slice_list + [subpath] if subpath else slice_list
        self.datasets[shard_id][slice_] = value
        return

    # if slice it finds all the corresponding datasets and assigns slices of the value one by one
    first = slice_list[0]
    # Compare against None explicitly: a stop of 0 is a legitimate (empty)
    # bound and must not be mistaken for "no stop given".
    start_index = 0 if first.start is None else first.start
    if start_index < 0:
        start_index += self.num_samples
    start_index = max(start_index, 0)
    stop_index = self.num_samples if first.stop is None else first.stop
    if stop_index < 0:
        # Negative bounds count from the end, mirroring the start handling.
        stop_index += self.num_samples
    stop_index = min(stop_index, self.num_samples)
    # NOTE: the step of the slice is not taken into account here.

    cur_index = start_index
    while cur_index < stop_index:
        shard_id, offset = self.identify_shard(cur_index)
        end_index = min(offset + len(self.datasets[shard_id]), stop_index)
        cur_slice_list = [slice(cur_index - offset, end_index - offset)] + slice_list[1:]
        current_slice = cur_slice_list + [subpath] if subpath else cur_slice_list
        self.datasets[shard_id][current_slice] = value[
            cur_index - start_index : end_index - start_index
        ]
        cur_index = end_index
def __setitem__(self, slice_, value):
    """| Sets a slice or slices with a value
    | Usage:

    >>> ds_view = ds[5:15]
    >>> ds_view["image", 3, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8") # sets the 8th image

    The first index is interpreted relative to this view and shifted by
    ``self.offset`` before the write reaches the parent dataset's tensor.

    Raises
    ------
    ValueError
        If the index carries no key (subpath).
    """
    # handling strings and bytes
    assign_value = str_to_int(value, self.dataset.tokenizer)
    if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
        slice_ = [slice_]
    slice_ = list(slice_)
    subpath, slice_list = slice_split(slice_)
    # A squeezed view holds one sample; address it with an implicit index 0.
    slice_list = [0] + slice_list if self.squeeze_dim else slice_list
    if not subpath:
        raise ValueError("Can't assign to dataset sliced without subpath")
    elif not slice_list:
        slice_ = (
            self.offset
            if self.num_samples == 1
            else slice(self.offset, self.offset + self.num_samples)
        )
        self.dataset._tensors[subpath][slice_] = assign_value  # Add path check
    else:
        # Send integer indexes through slice_extract_info as well (the view's
        # __getitem__ already does), so they are validated against
        # num_samples instead of being added to the offset unchecked.
        num, ofs = slice_extract_info(slice_list[0], self.num_samples)
        slice_list[0] = (
            slice(ofs + self.offset, ofs + self.offset + num)
            if num > 1
            else ofs + self.offset
        )
        self.dataset._tensors[subpath][slice_list] = assign_value
def __getitem__(self, slice_):
    """| Narrows this tensorview with a further slice or slices
    | Usage:

    >>> images_tensorview = ds["image"]
    >>> return images_tensorview[7, 0:1920, 0:1080, 0:3].compute() # returns numpy array of 7th image

    Raises
    ------
    ValueError
        If ``slice_split`` reports a subpath for the given index.
    """
    if isinstance(slice_, str) or not isinstance(slice_, abc.Iterable):
        slice_ = [slice_]
    slice_ = self.slice_fill(list(slice_))
    subpath, slice_list = slice_split(slice_)
    if subpath:
        raise ValueError("Can't slice a Tensor with string")

    nums = self.nums.copy()
    offsets = self.offsets.copy()
    requested = len(slice_list)
    if len(nums) < requested:
        # Pad the bookkeeping for dimensions this view has not sliced yet.
        nums.extend([None] * (requested - len(nums)))
        offsets.extend([0] * (requested - len(offsets)))

    # Merge every requested index with what the view already selects.
    for dim, piece in enumerate(slice_list):
        slice_list[dim] = self._combine(piece, nums[dim], offsets[dim])

    # Dimensions the caller left out keep the view's existing selection.
    for dim in range(requested, len(nums)):
        if nums[dim] > 1:
            kept = slice(offsets[dim], offsets[dim] + nums[dim])
        else:
            kept = offsets[dim]
        slice_list.append(kept)

    return TensorView(dataset=self.dataset, subpath=self.subpath, slice_=slice_list)
def __getitem__(self, slice_):
    """Fetch the samples selected by ``slice_`` from the underlying datasets.

    An integer first index is routed (through ``self.slicing``) to the single
    dataset holding that sample, and that dataset's result is returned
    directly.  A slice first index is cut at dataset boundaries; the
    per-dataset results are collected into a ``ComputeList``.

    Parameters
    ----------
    slice_: int, slice, str or an iterable of those
        Sample selection, optionally combined with a key and further slices.
    """
    if not isinstance(slice_, Iterable) or isinstance(slice_, str):
        slice_ = [slice_]
    slice_ = list(slice_)
    subpath, slice_list = slice_split(slice_)
    slice_list = slice_list or [slice(0, self.num_samples)]

    if isinstance(slice_list[0], int):
        # if integer it fetches the data from the corresponding dataset
        slice_list, shard_id = self.slicing(slice_list)
        slice_ = slice_list + [subpath] if subpath else slice_list
        return self.datasets[shard_id][slice_]

    # if slice it finds all the corresponding datasets included in the slice and
    # generates tensorviews or datasetviews (depending on slice); these views are
    # stored in a ComputeList
    first = slice_list[0]
    # Compare against None explicitly: a stop of 0 is a legitimate (empty)
    # bound and must not be mistaken for "no stop given".
    cur_index = 0 if first.start is None else first.start
    if cur_index < 0:
        cur_index += self.num_samples
    cur_index = max(cur_index, 0)
    stop_index = self.num_samples if first.stop is None else first.stop
    if stop_index < 0:
        # Negative bounds count from the end, mirroring the start handling.
        stop_index += self.num_samples
    stop_index = min(stop_index, self.num_samples)
    # NOTE: the step of the slice is not taken into account here.

    results = []
    while cur_index < stop_index:
        shard_id, offset = self.identify_shard(cur_index)
        end_index = min(offset + len(self.datasets[shard_id]), stop_index)
        cur_slice_list = [slice(cur_index - offset, end_index - offset)] + slice_list[1:]
        current_slice = cur_slice_list + [subpath] if subpath else cur_slice_list
        results.append(self.datasets[shard_id][current_slice])
        cur_index = end_index
    return ComputeList(results)
def __setitem__(self, slice_, value):
    """| Stores a value under a key, optionally restricted to a slice or slices
    | Usage:

    >>> ds["image", 5, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8")
    >>> images = ds["image"]
    >>> image = images[5]
    >>> image[0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8")

    Raises
    ------
    ValueError
        If the index carries no key (subpath).
    KeyError
        If the key is not one of ``self.keys``.
    """
    # handling strings and bytes
    assign_value = str_to_int(get_value(value), self.tokenizer)

    if isinstance(slice_, str) or not isinstance(slice_, abc.Iterable):
        slice_ = [slice_]
    subpath, slice_list = slice_split(list(slice_))

    if not subpath:
        raise ValueError("Can't assign to dataset sliced without subpath")
    if subpath not in self.keys:
        raise KeyError(f"Key {subpath} not found in the dataset")

    if slice_list:
        self._tensors[subpath][slice_list] = assign_value
    else:
        # No slice given: overwrite the whole tensor.
        self._tensors[subpath][:] = assign_value
def __getitem__(self, slice_):
    """| Get an item to be computed without iterating on the whole dataset.
    | Creates a dataset view, then a temporary dataset to apply the transform.

    Parameters:
    ----------
    slice_: slice
        Gets a slice or slices from dataset
    """
    if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
        slice_ = [slice_]
    slice_ = list(slice_)
    subpath, slice_list = slice_split(slice_)

    if len(slice_list) == 0:
        slice_list = [slice(None, None, None)]

    num, ofs = slice_extract_info(slice_list[0], self.shape[0])
    ds_view = DatasetView(
        dataset=self._ds,
        num_samples=num,
        offset=ofs,
        squeeze_dim=isinstance(slice_list[0], int),
    )

    path = posixpath.expanduser("~/.activeloop/tmparray")
    new_ds = self.store(path, length=num, ds=ds_view, progressbar=False)

    # The temporary dataset only holds the selected samples, so the sample
    # index must be replaced by "everything" (or 0 for a single sample).
    sample_index = 0 if isinstance(slice_list[0], int) else slice(None, None, None)
    # Replace the first non-key entry rather than a fixed position, so that an
    # index made only of a key (e.g. ds["image"]) keeps its key.
    for position, item in enumerate(slice_):
        if not isinstance(item, str):
            slice_[position] = sample_index
            break
    else:
        slice_.append(sample_index)
    return new_ds[slice_]
def __setitem__(self, slice_, value):
    """| Sets a slice or slices with a value
    | Usage:

    >>> images_tensorview = ds["image"]
    >>> images_tensorview[7, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8") # sets 7th image

    Raises
    ------
    ValueError
        If ``slice_split`` reports a subpath for the given index.
    """
    # handling strings and bytes
    assign_value = str_to_int(value, self.dataset.tokenizer)

    if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
        slice_ = [slice_]
    slice_ = list(slice_)
    slice_ = self.slice_fill(slice_)
    subpath, slice_list = slice_split(slice_)

    if subpath:
        raise ValueError("Can't setitem of TensorView with subpath")

    new_nums = self.nums.copy()
    new_offsets = self.offsets.copy()
    if len(new_nums) < len(slice_list):
        # Pad the bookkeeping for dimensions this view has not sliced yet.
        new_nums.extend([None] * (len(slice_list) - len(new_nums)))
        new_offsets.extend([0] * (len(slice_list) - len(new_offsets)))
    for i in range(len(slice_list)):
        slice_list[i] = self._combine(slice_list[i], new_nums[i], new_offsets[i])
    # Dimensions the caller left out keep the view's existing selection.
    for i in range(len(slice_list), len(new_nums)):
        cur_slice = (
            slice(new_offsets[i], new_offsets[i] + new_nums[i])
            if new_nums[i] > 1
            else new_offsets[i]
        )
        slice_list.append(cur_slice)
    # Store the converted value, not the raw argument, so the string/bytes
    # handling above takes effect.
    self.dataset._tensors[self.subpath][slice_list] = assign_value
def __getitem__(self, slice_):
    """| Gets a slice or slices from tensorview
    | Usage:

    >>> images_tensorview = ds["image"]
    >>> return images_tensorview[7, 0:1920, 0:1080, 0:3].compute() # returns numpy array of 7th image

    Returns a ``TensorView``, or an ``ObjectView`` when a subpath is given or
    the index goes deeper than this view's dimensions on a sequence dtype.
    The view is computed before being returned unless ``self.lazy`` is set.
    """
    if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
        slice_ = [slice_]
    slice_ = list(slice_)
    slice_ = self.slice_fill(slice_)
    subpath, slice_list = slice_split(slice_)

    new_nums = self.nums.copy()
    new_offsets = self.offsets.copy()

    # Resolve the sample (first) dimension against the indexes this view holds.
    if isinstance(self.indexes, list):
        new_indexes = self.indexes[slice_list[0]]
        # Only a list result can be collapsed into a slice; an integer index
        # yields a single int, which must not be subscripted.
        if self.is_contiguous and isinstance(new_indexes, list) and new_indexes:
            new_indexes = slice(new_indexes[0], new_indexes[-1] + 1)
    elif isinstance(self.indexes, int):
        new_indexes = self.indexes
    else:
        ofs = self.indexes.start or 0
        num = self.indexes.stop - ofs if self.indexes.stop else None
        new_indexes = self._combine(slice_list[0], num, ofs)
    slice_list[0] = new_indexes

    if len(new_nums) < len(slice_list):
        # Pad the bookkeeping for dimensions this view has not sliced yet.
        new_nums.extend([None] * (len(slice_list) - len(new_nums)))
        new_offsets.extend([0] * (len(slice_list) - len(new_offsets)))
    for i in range(1, len(slice_list)):
        slice_list[i] = self._combine(slice_list[i], new_nums[i], new_offsets[i])
    # Dimensions the caller left out keep the view's existing selection.
    for i in range(len(slice_list), len(new_nums)):
        cur_slice = (
            slice(new_offsets[i], new_offsets[i] + new_nums[i])
            if not self.squeeze_dims[i]
            else new_offsets[i]
        )
        slice_list.append(cur_slice)

    if subpath or (
        len(slice_list) > len(self.nums) and isinstance(self.dtype, objv.Sequence)
    ):
        objectview = objv.ObjectView(
            dataset=self.dataset,
            subpath=self.subpath + subpath,
            slice_=slice_list,
            lazy=self.lazy,
        )
        return objectview if self.lazy else objectview.compute()
    else:
        tensorview = TensorView(
            dataset=self.dataset,
            subpath=self.subpath,
            slice_=slice_list,
            lazy=self.lazy,
        )
        return tensorview if self.lazy else tensorview.compute()
def __getitem__(self, slice_):
    """| Gets a slice or slices from DatasetView
    | Usage:

    >>> ds_view = ds[5:15]
    >>> return ds_view["image", 7, 0:1920, 0:1080, 0:3].compute() # returns numpy array of 12th image

    Raises
    ------
    ValueError
        If several slices are given without a key, or if a dictionary of
        tensors is addressed with more than one slice.
    """
    if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
        slice_ = [slice_]
    slice_ = list(slice_)
    subpath, slice_list = slice_split(slice_)
    # A squeezed view holds one sample; address it with an implicit index 0.
    slice_list = [0] + slice_list if self.squeeze_dim else slice_list

    if not subpath:
        if len(slice_list) > 1:
            raise ValueError(
                "Can't slice a dataset with multiple slices without subpath"
            )
        num, ofs = slice_extract_info(slice_list[0], self.num_samples)
        return DatasetView(
            dataset=self.dataset,
            num_samples=num,
            offset=ofs + self.offset,
            squeeze_dim=isinstance(slice_list[0], int),
        )
    elif not slice_list:
        slice_ = slice(self.offset, self.offset + self.num_samples)
        if subpath in self.dataset._tensors.keys():
            return TensorView(
                dataset=self.dataset,
                subpath=subpath,
                slice_=slice_,
                squeeze_dims=[True] if self.squeeze_dim else [],
            )
        # Call the helper the same way as the final return of this method:
        # positional (subpath, slice).
        # NOTE(review): assumes the signature is _get_dictionary(subpath, slice_) - confirm.
        return self._get_dictionary(subpath, slice_)
    else:
        num, ofs = slice_extract_info(slice_list[0], self.num_samples)
        # Translate the view-relative first index into the parent dataset's coordinates.
        slice_list[0] = (
            ofs + self.offset
            if num == 1
            else slice(ofs + self.offset, ofs + self.offset + num)
        )
        if subpath in self.dataset._tensors.keys():
            return TensorView(
                dataset=self.dataset,
                subpath=subpath,
                slice_=slice_list,
                squeeze_dims=[True] if self.squeeze_dim else [],
            )
        if len(slice_list) > 1:
            raise ValueError("You can't slice a dictionary of Tensors")
        return self._get_dictionary(subpath, slice_list[0])
def __getitem__(self, slice_):
    """| Fetches a dataset view, a tensor view or a dictionary of tensors
    | Usage:

    >>> return ds["image", 5, 0:1920, 0:1080, 0:3].compute() # returns numpy array
    >>> images = ds["image"]
    >>> return images[5].compute() # returns numpy array
    >>> images = ds["image"]
    >>> image = images[5]
    >>> return image[0:1920, 0:1080, 0:3].compute()

    Tensor views are computed before being returned unless ``self.lazy`` is set.

    Raises
    ------
    ValueError
        If several slices are given without a key, or if a dictionary of
        tensors is addressed with more than one slice.
    """
    if isinstance(slice_, str) or not isinstance(slice_, abc.Iterable):
        slice_ = [slice_]
    subpath, slice_list = slice_split(list(slice_))

    # No key: the result is a view over whole samples.
    if not subpath:
        if len(slice_list) > 1:
            raise ValueError(
                "Can't slice a dataset with multiple slices without subpath"
            )
        count, start = slice_extract_info(slice_list[0], self.shape[0])
        return DatasetView(
            dataset=self,
            num_samples=count,
            offset=start,
            squeeze_dim=isinstance(slice_list[0], int),
            lazy=self.lazy,
        )

    # Key only: the whole tensor, or every tensor below the key.
    if not slice_list:
        if subpath not in self._tensors.keys():
            return self._get_dictionary(subpath)
        view = TensorView(
            dataset=self,
            subpath=subpath,
            slice_=slice(0, self.shape[0]),
            lazy=self.lazy,
        )
        return view if self.lazy else view.compute()

    # Key and slices. The helper call is kept only so that it can reject an
    # invalid first index; its result is not needed here.
    slice_extract_info(slice_list[0], self.shape[0])
    if subpath in self._tensors.keys():
        view = TensorView(
            dataset=self, subpath=subpath, slice_=slice_list, lazy=self.lazy
        )
        return view if self.lazy else view.compute()
    if len(slice_list) > 1:
        raise ValueError("You can't slice a dictionary of Tensors")
    return self._get_dictionary(subpath, slice_list[0])
def __setitem__(self, slice_, value):
    """| Writes a value through this view into the parent dataset
    | Usage:

    >>> ds_view = ds[5:15]
    >>> ds_view["image", 3, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8") # sets the 8th image

    Raises
    ------
    ValueError
        If the index carries no key (subpath).
    KeyError
        If the key is not one of ``self.keys``.
    """
    self.dataset._auto_checkout()
    # handling strings and bytes
    assign_value = str_to_int(get_value(value), self.dataset.tokenizer)

    if isinstance(slice_, str) or not isinstance(slice_, abc.Iterable):
        slice_ = [slice_]
    subpath, slice_list = slice_split(list(slice_))
    if isinstance(self.indexes, int):
        # A single-sample view is addressed with an implicit index 0.
        slice_list = [0] + slice_list

    if not subpath:
        raise ValueError("Can't assign to dataset sliced without key")
    if subpath not in self.keys:
        raise KeyError(f"Key {subpath} not found in dataset")

    if not slice_list:
        # Key only: write to every sample of the view.
        if self.is_contiguous:
            selection = slice(self.indexes[0], self.indexes[-1] + 1)
        else:
            selection = self.indexes
        if isinstance(selection, list):
            # Scattered samples are written one by one.
            for position, sample_index in enumerate(selection):
                self.dataset._tensors[subpath][sample_index] = assign_value[position]
        else:
            self.dataset._tensors[subpath][selection] = assign_value
        return

    # Translate the first index from view coordinates to dataset coordinates.
    if isinstance(self.indexes, list):
        chosen = self.indexes[slice_list[0]]
        if self.is_contiguous and isinstance(chosen, list) and chosen:
            chosen = slice(chosen[0], chosen[-1] + 1)
        slice_list[0] = chosen
    else:
        slice_list[0] = self.indexes

    if isinstance(slice_list[0], list):
        # Scattered samples are written one by one.
        for position, sample_index in enumerate(slice_list[0]):
            per_sample = [sample_index] + slice_list[1:]
            self.dataset._tensors[subpath][per_sample] = assign_value[position]
    else:
        self.dataset._tensors[subpath][slice_list] = assign_value
def __setitem__(self, slice_, value):
    """| Sets a slice or slices with a value
    | Usage:

    >>> images_tensorview = ds["image"]
    >>> images_tensorview[7, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8") # sets 7th image

    Raises
    ------
    ValueError
        If ``slice_split`` reports a subpath for the given index.
    """
    assign_value = get_value(value)
    # handling strings and bytes
    assign_value = str_to_int(assign_value, self.dataset.tokenizer)

    if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
        slice_ = [slice_]
    slice_ = list(slice_)
    slice_ = self.slice_fill(slice_)
    subpath, slice_list = slice_split(slice_)
    if subpath:
        raise ValueError("Can't setitem of TensorView with subpath")

    new_nums = self.nums.copy()
    new_offsets = self.offsets.copy()

    # Resolve the sample (first) dimension against the indexes this view holds.
    if isinstance(self.indexes, list):
        new_indexes = self.indexes[slice_list[0]]
        # Only a list result can be collapsed into a slice; an integer index
        # yields a single int, which must not be subscripted.
        if self.is_contiguous and isinstance(new_indexes, list) and new_indexes:
            new_indexes = slice(new_indexes[0], new_indexes[-1] + 1)
    elif isinstance(self.indexes, int):
        new_indexes = self.indexes
    else:
        ofs = self.indexes.start or 0
        num = self.indexes.stop - ofs if self.indexes.stop else None
        new_indexes = self._combine(slice_list[0], num, ofs)
    slice_list[0] = new_indexes

    if len(new_nums) < len(slice_list):
        # Pad the bookkeeping for dimensions this view has not sliced yet.
        new_nums.extend([None] * (len(slice_list) - len(new_nums)))
        new_offsets.extend([0] * (len(slice_list) - len(new_offsets)))
    for i in range(1, len(slice_list)):
        slice_list[i] = self._combine(slice_list[i], new_nums[i], new_offsets[i])
    # Dimensions the caller left out keep the view's existing selection.
    for i in range(len(slice_list), len(new_nums)):
        cur_slice = (
            slice(new_offsets[i], new_offsets[i] + new_nums[i])
            if not self.squeeze_dims[i]
            else new_offsets[i]
        )
        slice_list.append(cur_slice)

    if isinstance(slice_list[0], (int, slice)):
        self.dataset._tensors[self.subpath][slice_list] = assign_value
    else:
        # Scattered samples are written one by one.  The tensor is addressed
        # through self.subpath: the local ``subpath`` is always empty here.
        for i, index in enumerate(slice_list[0]):
            current_slice = [index] + slice_list[1:]
            self.dataset._tensors[self.subpath][current_slice] = assign_value[i]
def __getitem__(self, slice_):
    """| Narrows this tensorview with a further slice or slices
    | Usage:

    >>> images_tensorview = ds["image"]
    >>> return images_tensorview[7, 0:1920, 0:1080, 0:3].compute() # returns numpy array of 7th image

    Returns a ``TensorView``, or an ``ObjectView`` when a subpath is given or
    the index goes deeper than this view's dimensions on a sequence dtype.
    The view is computed before being returned unless ``self.lazy`` is set.
    """
    if isinstance(slice_, str) or not isinstance(slice_, abc.Iterable):
        slice_ = [slice_]
    slice_ = self.slice_fill(list(slice_))
    subpath, slice_list = slice_split(slice_)

    nums = self.nums.copy()
    offsets = self.offsets.copy()
    requested = len(slice_list)
    if len(nums) < requested:
        # Pad the bookkeeping for dimensions this view has not sliced yet.
        nums.extend([None] * (requested - len(nums)))
        offsets.extend([0] * (requested - len(offsets)))

    # Merge every requested index with what the view already selects.
    for dim, piece in enumerate(slice_list):
        slice_list[dim] = self._combine(piece, nums[dim], offsets[dim])

    # Dimensions the caller left out keep the view's existing selection.
    for dim in range(requested, len(nums)):
        if nums[dim] > 1:
            kept = slice(offsets[dim], offsets[dim] + nums[dim])
        else:
            kept = offsets[dim]
        slice_list.append(kept)

    if not subpath and not (
        len(slice_list) > len(self.nums) and isinstance(self.dtype, Sequence)
    ):
        view = TensorView(
            dataset=self.dataset,
            subpath=self.subpath,
            slice_=slice_list,
            lazy=self.lazy,
        )
        return view if self.lazy else view.compute()

    view = ObjectView(
        dataset=self.dataset,
        subpath=self.subpath + subpath,
        slice_list=slice_list,
        lazy=self.lazy,
    )
    return view if self.lazy else view.compute()
def __setitem__(self, slice_, value):
    """| Stores a value under a key, optionally restricted to a slice or slices
    | Usage:

    >>> ds["image", 5, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8")
    >>> images = ds["image"]
    >>> image = images[5]
    >>> image[0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8")

    Raises
    ------
    ReadModeException
        If the dataset mode contains "r".
    ValueError
        If the index carries no key (subpath).
    KeyError
        If the key is not one of ``self.keys``.
    """
    if "r" in self._mode:
        raise ReadModeException("__setitem__")
    self._auto_checkout()

    if isinstance(slice_, str) or not isinstance(slice_, abc.Iterable):
        slice_ = [slice_]
    subpath, slice_list = slice_split(list(slice_))

    if not subpath:
        raise ValueError("Can't assign to dataset sliced without subpath")
    if subpath not in self.keys:
        raise KeyError(f"Key {subpath} not found in the dataset")

    assign_value = get_value(value)

    # Look up the schema entry for the key (the key without its first character).
    relative_key = subpath[1:]
    node = self.schema
    if relative_key in node.dict_.keys():
        schema_key = node.dict_.get(relative_key, None)
    else:
        # Nested key: descend one path component at a time.  While the node
        # is still a SchemaDict the component name is kept; otherwise the
        # node found at that component is kept.
        for part in relative_key.split("/"):
            node = node.dict_.get(part, None)
            schema_key = part if isinstance(node, SchemaDict) else node

    if isinstance(schema_key, ClassLabel):
        assign_value = check_class_label(assign_value, schema_key)

    has_text_schema = isinstance(schema_key, (Text, bytes))
    if has_text_schema or (
        isinstance(assign_value, Iterable)
        and any(isinstance(val, str) for val in assign_value)
    ):
        # handling strings and bytes
        assign_value = str_to_int(assign_value, self.tokenizer)

    if slice_list:
        self._tensors[subpath][slice_list] = assign_value
    else:
        # No slice given: overwrite the whole tensor.
        self._tensors[subpath][:] = assign_value
def __setitem__(self, slice_, value):
    """| Writes a value into the part of the tensor this view points at
    | Usage:

    >>> images_tensorview = ds["image"]
    >>> images_tensorview[7, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8") # sets 7th image

    The write goes through an ``ObjectView`` when a subpath is given or the
    index goes deeper than this view's dimensions on a sequence dtype;
    otherwise it goes straight to the dataset's tensor.
    """
    # handling strings and bytes
    assign_value = str_to_int(get_value(value), self.dataset.tokenizer)

    if isinstance(slice_, str) or not isinstance(slice_, abc.Iterable):
        slice_ = [slice_]
    slice_ = self.slice_fill(list(slice_))
    subpath, slice_list = slice_split(slice_)

    nums = self.nums.copy()
    offsets = self.offsets.copy()
    requested = len(slice_list)
    if len(nums) < requested:
        # Pad the bookkeeping for dimensions this view has not sliced yet.
        nums.extend([None] * (requested - len(nums)))
        offsets.extend([0] * (requested - len(offsets)))

    # Merge every requested index with what the view already selects.
    for dim, piece in enumerate(slice_list):
        slice_list[dim] = self._combine(piece, nums[dim], offsets[dim])

    # Dimensions the caller left out keep the view's existing selection.
    for dim in range(requested, len(nums)):
        if nums[dim] > 1:
            kept = slice(offsets[dim], offsets[dim] + nums[dim])
        else:
            kept = offsets[dim]
        slice_list.append(kept)

    if not subpath and not (
        len(slice_list) > len(self.nums) and isinstance(self.dtype, Sequence)
    ):
        self.dataset._tensors[self.subpath][slice_list] = assign_value
        return

    target = ObjectView(
        dataset=self.dataset,
        subpath=self.subpath + subpath,
        slice_list=slice_list,
    )
    target[:] = assign_value
def __getitem__(self, slice_):
    """| Gets a slice from an objectview

    The index may contain a subpath (resolved through ``self.process_path``),
    a first index that narrows the samples this view covers, and further
    indexes applied to the remaining non-squeezed dimensions.  The resulting
    ``ObjectView`` is computed before being returned unless ``self.lazy`` is set.

    Raises
    ------
    IndexError
        If more indexes are given than there are non-squeezed dimensions.
    """
    if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
        slice_ = [slice_]
    slice_ = list(slice_)
    subpath, slice_list = slice_split(slice_)

    nums, offsets, squeeze_dims, inner_schema_obj = (
        self.nums.copy(),
        self.offsets.copy(),
        self.squeeze_dims.copy(),
        self.inner_schema_obj,
    )

    if subpath:
        inner_schema_obj, nums, offsets, squeeze_dims = self.process_path(
            subpath, inner_schema_obj, nums, offsets, squeeze_dims
        )
    subpath = self.subpath + subpath

    new_indexes = self.indexes
    if len(slice_list) >= 1:
        if isinstance(self.indexes, list):
            new_indexes = self.indexes[slice_list[0]]
            # Only a list result can be collapsed into a slice; an integer
            # index yields a single int, which must not be subscripted.
            if self.is_contiguous and isinstance(new_indexes, list) and new_indexes:
                new_indexes = slice(new_indexes[0], new_indexes[-1] + 1)
            slice_list = slice_list[1:]
        elif isinstance(self.indexes, slice):
            ofs = self.indexes.start or 0
            num = self.indexes.stop - ofs if self.indexes.stop else None
            num, ofs_temp = slice_extract_info(slice_list[0], num)
            new_indexes = (
                ofs + ofs_temp
                if isinstance(slice_list[0], int)
                else slice(ofs + ofs_temp, ofs + ofs_temp + num)
            )
            slice_list = slice_list[1:]

    if len(slice_list) >= 1:
        # Expand slice list: squeezed dimensions take no index from the caller
        exp_slice_list = []
        for squeeze in squeeze_dims:
            if squeeze:
                exp_slice_list += [None]
            else:
                if len(slice_list) > 0:
                    exp_slice_list += [slice_list.pop(0)]
                else:
                    # slice list smaller than max
                    exp_slice_list += [None]
        if len(slice_list) > 0:
            # slice list longer than max
            raise IndexError("Too many indices")
        for i, it in enumerate(exp_slice_list):
            if it is not None:
                num, ofs = slice_extract_info(it, nums[i])
                nums[i] = num
                offsets[i] += ofs
                squeeze_dims[i] = isinstance(it, int)

    objectview = ObjectView(
        dataset=self.dataset,
        subpath=subpath,
        slice_=None,
        indexes=new_indexes,
        nums=nums,
        offsets=offsets,
        squeeze_dims=squeeze_dims,
        inner_schema_obj=inner_schema_obj,
        lazy=self.lazy,
        check_bounds=False,
    )
    return objectview if self.lazy else objectview.compute()
def __getitem__(self, slice_):
    """| Fetches a narrower view, a tensor view, an object view or a dictionary of tensors
    | Usage:

    >>> ds_view = ds[5:15]
    >>> return ds_view["image", 7, 0:1920, 0:1080, 0:3].compute() # returns numpy array of 12th image

    Tensor and object views are computed before being returned unless
    ``self.lazy`` is set.

    Raises
    ------
    ValueError
        If several slices are given without a key, or if a dictionary of
        tensors is addressed with more than one slice.
    """

    def resolve(view):
        # Hand the view back untouched in lazy mode, otherwise compute it.
        return view if self.lazy else view.compute()

    if isinstance(slice_, str) or not isinstance(slice_, abc.Iterable):
        slice_ = [slice_]
    subpath, slice_list = slice_split(list(slice_))
    if isinstance(self.indexes, int):
        # A single-sample view is addressed with an implicit index 0.
        slice_list = [0] + slice_list

    # No key: narrow the view itself.
    if not subpath:
        if len(slice_list) > 1:
            raise ValueError("Can't slice dataset with multiple slices without key")
        narrowed = self.indexes[slice_list[0]]
        return DatasetView(dataset=self.dataset, lazy=self.lazy, indexes=narrowed)

    # Key only: every sample of the view.
    if not slice_list:
        if self.is_contiguous:
            whole = [slice(self.indexes[0], self.indexes[-1] + 1)]
        else:
            whole = [self.indexes]
        if subpath in self.keys:
            return resolve(
                TensorView(
                    dataset=self.dataset,
                    subpath=subpath,
                    slice_=whole,
                    lazy=self.lazy,
                )
            )
        for key in self.keys:
            if subpath.startswith(key):
                return resolve(
                    ObjectView(
                        dataset=self.dataset,
                        subpath=subpath,
                        slice_=whole,
                        lazy=self.lazy,
                    )
                )
        return self._get_dictionary(subpath, whole)

    # Key and slices: translate the first index into dataset coordinates.
    if isinstance(self.indexes, list):
        picked = self.indexes[slice_list[0]]
        if self.is_contiguous and isinstance(picked, list) and picked:
            picked = slice(picked[0], picked[-1] + 1)
    else:
        picked = self.indexes
    slice_list[0] = picked

    schema_obj = self.dataset.schema.dict_[subpath.split("/")[1]]
    if subpath in self.keys and (
        not isinstance(schema_obj, Sequence) or len(slice_list) <= 1
    ):
        return resolve(
            TensorView(
                dataset=self.dataset,
                subpath=subpath,
                slice_=slice_list,
                lazy=self.lazy,
            )
        )
    for key in self.keys:
        if subpath.startswith(key):
            return resolve(
                ObjectView(
                    dataset=self.dataset,
                    subpath=subpath,
                    slice_=slice_list,
                    lazy=self.lazy,
                )
            )
    if len(slice_list) > 1:
        raise ValueError("You can't slice a dictionary of Tensors")
    return self._get_dictionary(subpath, slice_list[0])
def __setitem__(self, slice_, value):
    """| Sets a slice or slices with a value
    | Usage:

    >>> ds_view = ds[5:15]
    >>> ds_view["image", 3, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8") # sets the 8th image

    Raises
    ------
    ValueError
        If the index carries no key (subpath).
    KeyError
        If the key is not one of ``self.keys``.
    """
    self.dataset._auto_checkout()

    if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
        slice_ = [slice_]
    slice_ = list(slice_)
    subpath, slice_list = slice_split(slice_)
    # A single-sample view is addressed with an implicit index 0.
    slice_list = [0] + slice_list if isinstance(self.indexes, int) else slice_list

    # Validate the key before walking the schema: the walk below dereferences
    # ``.dict_`` on every level and would otherwise fail with an unrelated
    # AttributeError for an unknown nested key.
    if not subpath:
        raise ValueError("Can't assign to dataset sliced without key")
    elif subpath not in self.keys:
        raise KeyError(f"Key {subpath} not found in dataset")

    assign_value = get_value(value)
    schema_dict = self.dataset.schema
    if subpath[1:] in schema_dict.dict_.keys():
        schema_key = schema_dict.dict_.get(subpath[1:], None)
    else:
        # Nested key: descend one path component at a time.
        for schema_key in subpath[1:].split("/"):
            schema_dict = schema_dict.dict_.get(schema_key, None)
            if not isinstance(schema_dict, SchemaDict):
                schema_key = schema_dict
    if isinstance(schema_key, ClassLabel):
        assign_value = check_class_label(assign_value, schema_key)
    if isinstance(schema_key, (Text, bytes)) or (
        isinstance(assign_value, Iterable)
        and any(isinstance(val, str) for val in assign_value)
    ):
        # handling strings and bytes
        assign_value = str_to_int(assign_value, self.dataset.tokenizer)

    if not slice_list:
        # Key only: write to every sample of the view.
        slice_ = (
            slice(self.indexes[0], self.indexes[-1] + 1)
            if self.is_contiguous
            else self.indexes
        )
        if not isinstance(slice_, list):
            self.dataset._tensors[subpath][slice_] = assign_value
        else:
            # Scattered samples are written one by one.
            for i, index in enumerate(slice_):
                self.dataset._tensors[subpath][index] = assign_value[i]
    else:
        # Translate the first index from view coordinates to dataset coordinates.
        if isinstance(self.indexes, list):
            indexes = self.indexes[slice_list[0]]
            if self.is_contiguous and isinstance(indexes, list) and indexes:
                slice_list[0] = slice(indexes[0], indexes[-1] + 1)
            else:
                slice_list[0] = indexes
        else:
            slice_list[0] = self.indexes

        if not isinstance(slice_list[0], list):
            self.dataset._tensors[subpath][slice_list] = assign_value
        else:
            # Scattered samples are written one by one.
            for i, index in enumerate(slice_list[0]):
                current_slice = [index] + slice_list[1:]
                self.dataset._tensors[subpath][current_slice] = assign_value[i]
def __getitem__(self, slice_):
    """| Gets a slice or slices from DatasetView
    | Usage:

    >>> ds_view = ds[5:15]
    >>> return ds_view["image", 7, 0:1920, 0:1080, 0:3].compute() # returns numpy array of 12th image

    Tensor and object views are computed before being returned unless
    ``self.lazy`` is set.

    Raises
    ------
    ValueError
        If several slices are given without a key, or if a dictionary of
        tensors is addressed with more than one slice.
    """
    if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
        slice_ = [slice_]
    slice_ = list(slice_)
    subpath, slice_list = slice_split(slice_)
    # A squeezed view holds one sample; address it with an implicit index 0.
    slice_list = [0] + slice_list if self.squeeze_dim else slice_list

    if not subpath:
        if len(slice_list) > 1:
            raise ValueError(
                "Can't slice a dataset with multiple slices without subpath"
            )
        num, ofs = slice_extract_info(slice_list[0], self.num_samples)
        return DatasetView(
            dataset=self.dataset,
            num_samples=num,
            offset=ofs + self.offset,
            squeeze_dim=isinstance(slice_list[0], int),
            lazy=self.lazy,
        )
    elif not slice_list:
        slice_ = (
            slice(self.offset, self.offset + self.num_samples)
            if not self.squeeze_dim
            else self.offset
        )
        if subpath in self.dataset._tensors.keys():
            tensorview = TensorView(
                dataset=self.dataset,
                subpath=subpath,
                slice_=slice_,
                lazy=self.lazy,
            )
            return tensorview if self.lazy else tensorview.compute()
        for key in self.dataset._tensors.keys():
            if subpath.startswith(key):
                objectview = ObjectView(
                    dataset=self.dataset,
                    subpath=subpath,
                    slice_list=[slice_],
                    lazy=self.lazy,
                )
                return objectview if self.lazy else objectview.compute()
        # Call the helper the same way as the final return of this method:
        # positional (subpath, slice).
        # NOTE(review): assumes the signature is _get_dictionary(subpath, slice_) - confirm.
        return self._get_dictionary(subpath, slice_)
    else:
        num, ofs = slice_extract_info(slice_list[0], self.num_samples)
        # Translate the view-relative first index into the parent dataset's coordinates.
        slice_list[0] = (
            ofs + self.offset
            if isinstance(slice_list[0], int)
            else slice(ofs + self.offset, ofs + self.offset + num)
        )
        schema_obj = self.dataset.schema.dict_[subpath.split("/")[1]]
        if subpath in self.dataset._tensors.keys() and (
            not isinstance(schema_obj, Sequence) or len(slice_list) <= 1
        ):
            tensorview = TensorView(
                dataset=self.dataset,
                subpath=subpath,
                slice_=slice_list,
                lazy=self.lazy,
            )
            return tensorview if self.lazy else tensorview.compute()
        for key in self.dataset._tensors.keys():
            if subpath.startswith(key):
                objectview = ObjectView(
                    dataset=self.dataset,
                    subpath=subpath,
                    slice_list=slice_list,
                    lazy=self.lazy,
                )
                return objectview if self.lazy else objectview.compute()
        if len(slice_list) > 1:
            raise ValueError("You can't slice a dictionary of Tensors")
        return self._get_dictionary(subpath, slice_list[0])
def __getitem__(self, slice_):
    """| Fetches a dataset view, a tensor view, an object view or a dictionary of tensors
    | Usage:

    >>> return ds["image", 5, 0:1920, 0:1080, 0:3].compute() # returns numpy array
    >>> images = ds["image"]
    >>> return images[5].compute() # returns numpy array
    >>> images = ds["image"]
    >>> image = images[5]
    >>> return image[0:1920, 0:1080, 0:3].compute()

    Tensor and object views are computed before being returned unless
    ``self.lazy`` is set.

    Raises
    ------
    ValueError
        If several slices are given without a key, or if a dictionary of
        tensors is addressed with more than one slice.
    """

    def resolve(view):
        # Hand the view back untouched in lazy mode, otherwise compute it.
        return view if self.lazy else view.compute()

    if isinstance(slice_, str) or not isinstance(slice_, abc.Iterable):
        slice_ = [slice_]
    subpath, slice_list = slice_split(list(slice_))

    # No key: the result is a view over whole samples.
    if not subpath:
        if len(slice_list) > 1:
            raise ValueError("Can't slice a dataset with multiple slices without key")
        selected = self.indexes[slice_list[0]]
        return DatasetView(
            dataset=self,
            indexes=selected,
            lazy=self.lazy,
        )

    # Key only: the whole tensor, object, or every tensor below the key.
    if not slice_list:
        if subpath in self.keys:
            return resolve(
                TensorView(
                    dataset=self,
                    subpath=subpath,
                    slice_=slice(0, self._shape[0]),
                    lazy=self.lazy,
                )
            )
        for key in self.keys:
            if subpath.startswith(key):
                return resolve(
                    ObjectView(
                        dataset=self,
                        subpath=subpath,
                        lazy=self.lazy,
                        slice_=[slice(0, self._shape[0])],
                    )
                )
        return self._get_dictionary(subpath)

    # Key and slices.
    schema_obj = self.schema.dict_[subpath.split("/")[1]]
    if subpath in self.keys and (
        not isinstance(schema_obj, Sequence) or len(slice_list) <= 1
    ):
        return resolve(
            TensorView(
                dataset=self, subpath=subpath, slice_=slice_list, lazy=self.lazy
            )
        )
    for key in self.keys:
        if subpath.startswith(key):
            return resolve(
                ObjectView(
                    dataset=self,
                    subpath=subpath,
                    slice_=slice_list,
                    lazy=self.lazy,
                )
            )
    if len(slice_list) > 1:
        raise ValueError("You can't slice a dictionary of Tensors")
    return self._get_dictionary(subpath, slice_list[0])
def __getitem__(self, slice_):
    """| Gets a slice from an objectview

    The index may contain a subpath (resolved through ``self.process_path``),
    a first index that narrows the samples of the underlying dataset or
    dataset view, and further indexes applied to the remaining non-squeezed
    dimensions.  The resulting ``ObjectView`` is computed before being
    returned unless ``self.lazy`` is set.

    Raises
    ------
    IndexError
        If more indexes are given than there are non-squeezed dimensions.
    """
    if not isinstance(slice_, abc.Iterable) or isinstance(slice_, str):
        slice_ = [slice_]
    slice_ = list(slice_)
    subpath, slice_list = slice_split(slice_)

    dataset = self.dataset
    nums, offsets, squeeze_dims, inner_schema_obj = (
        self.nums.copy(),
        self.offsets.copy(),
        self.squeeze_dims.copy(),
        self.inner_schema_obj,
    )

    if subpath:
        inner_schema_obj, nums, offsets, squeeze_dims = self.process_path(
            subpath, inner_schema_obj, nums, offsets, squeeze_dims
        )
    subpath = self.subpath + subpath

    if len(slice_list) >= 1:
        # Slice first dim
        if isinstance(self.dataset, DatasetView) and not self.dataset.squeeze_dim:
            dataset = self.dataset[slice_list[0]]
            slice_list = slice_list[1:]
        elif not isinstance(self.dataset, DatasetView):
            num, ofs = slice_extract_info(slice_list[0], self.dataset.shape[0])
            dataset = DatasetView(
                self.dataset, num, ofs, isinstance(slice_list[0], int)
            )
            slice_list = slice_list[1:]

    # Expand slice list for rest of dims
    if len(slice_list) >= 1:
        exp_slice_list = []
        for squeeze in squeeze_dims:
            if squeeze:
                exp_slice_list += [None]
            else:
                if len(slice_list) > 0:
                    exp_slice_list += [slice_list.pop(0)]
                else:
                    # slice list smaller than max
                    exp_slice_list += [None]
        if len(slice_list) > 0:
            # slice list longer than max
            raise IndexError("Too many indices")
        for i, it in enumerate(exp_slice_list):
            if it is not None:
                num, ofs = slice_extract_info(it, nums[i])
                nums[i] = num
                offsets[i] += ofs
                # Only an integer index removes the dimension; a slice that
                # happens to select one element keeps it, matching how the
                # first dimension is handled above.
                squeeze_dims[i] = isinstance(it, int)

    objectview = ObjectView(
        dataset=dataset,
        subpath=subpath,
        slice_list=None,
        nums=nums,
        offsets=offsets,
        squeeze_dims=squeeze_dims,
        inner_schema_obj=inner_schema_obj,
        lazy=self.lazy,
        new=False,
    )
    return objectview if self.lazy else objectview.compute()