Ejemplo n.º 1
0
    def __getitem__(self, slice_):
        """| Compute only the requested item(s) instead of iterating on the whole dataset.
        | Wraps the selected samples in a DatasetView, stores the transform output for
        | that view in a temporary dataset and then indexes that temporary dataset.

        Parameters
        ----------
        slice_: str, int, slice or an iterable of them
            Selection to apply. A non-iterable value or a string is wrapped in a list.

        Returns
        -------
        Whatever indexing the temporary dataset with the adjusted ``slice_`` returns.
        """
        if isinstance(slice_, str) or not isinstance(slice_, abc.Iterable):
            slice_ = [slice_]
        else:
            slice_ = list(slice_)

        # Only the positional slices are needed here; the key part is discarded.
        _, slice_list = slice_split(slice_)
        if not slice_list:
            # No positional slice supplied: select every sample.
            slice_list = [slice(None, None, None)]

        first = slice_list[0]
        first_is_int = isinstance(first, int)
        num, ofs = slice_extract_info(first, self.shape[0])

        ds_view = DatasetView(
            dataset=self._ds,
            num_samples=num,
            offset=ofs,
            squeeze_dim=first_is_int,
        )

        path = posixpath.expanduser("~/.activeloop/tmparray")
        new_ds = self.store(path, length=num, ds=ds_view, progressbar=False)

        # The temporary dataset already holds just the selected samples, so the
        # sample selector is replaced by "everything" (or 0 for a single index).
        # NOTE(review): the position is picked from len(slice_) alone; when
        # slice_ is a lone key string that key is what gets overwritten --
        # confirm this is intended.
        position = 0 if len(slice_) <= 1 else 1
        slice_[position] = 0 if first_is_int else slice(None, None, None)
        return new_ds[slice_]
Ejemplo n.º 2
0
    def filter(self, fn):
        """| Builds a new DatasetView holding only the samples accepted by ``fn``

        Parameters
        ----------
        fn: function
            Called with ``self[index]`` for every index in ``self.indexes``.
            Indexes for which the result is truthy are retained, in their original order.

        Returns
        -------
        DatasetView
            View over this object restricted to the retained indexes, with the same ``lazy`` setting.
        """
        kept = []
        for idx in self.indexes:
            if fn(self[idx]):
                kept.append(idx)
        return DatasetView(dataset=self, lazy=self.lazy, indexes=kept)
Ejemplo n.º 3
0
 def __getitem__(self, slice_):
     """| Gets a slice or slices from dataset
     | Usage:
     >>> return ds["image", 5, 0:1920, 0:1080, 0:3].compute() # returns numpy array
     >>> images = ds["image"]
     >>> return images[5].compute() # returns numpy array
     >>> images = ds["image"]
     >>> image = images[5]
     >>> return image[0:1920, 0:1080, 0:3].compute()

     | Dispatch:
     | * no key           -> DatasetView over the selected samples (at most one slice allowed)
     | * key only         -> TensorView over all samples when the key is a tensor,
     |                       otherwise the result of ``_get_dictionary(key)``
     | * key and slice(s) -> TensorView with those slices when the key is a tensor,
     |                       otherwise ``_get_dictionary(key, slice)`` (one slice only)
     | TensorViews are returned as is when ``self.lazy`` is set and computed otherwise.
     """
     if isinstance(slice_, str) or not isinstance(slice_, abc.Iterable):
         slice_ = [slice_]
     subpath, slice_list = slice_split(list(slice_))

     # --- No key: plain sample selection on the dataset -----------------
     if not subpath:
         if len(slice_list) > 1:
             raise ValueError(
                 "Can't slice a dataset with multiple slices without subpath"
             )
         first = slice_list[0]
         num, ofs = slice_extract_info(first, self.shape[0])
         return DatasetView(
             dataset=self,
             num_samples=num,
             offset=ofs,
             squeeze_dim=isinstance(first, int),
             lazy=self.lazy,
         )

     # --- Key only: the whole tensor or a dictionary of tensors ---------
     if not slice_list:
         if subpath not in self._tensors.keys():
             return self._get_dictionary(subpath)
         view = TensorView(
             dataset=self,
             subpath=subpath,
             slice_=slice(0, self.shape[0]),
             lazy=self.lazy,
         )
         return view if self.lazy else view.compute()

     # --- Key and slices ------------------------------------------------
     # The result is not used; the call is kept because it presumably
     # validates the first index against the dataset length (TODO confirm).
     slice_extract_info(slice_list[0], self.shape[0])
     if subpath in self._tensors.keys():
         view = TensorView(
             dataset=self,
             subpath=subpath,
             slice_=slice_list,
             lazy=self.lazy,
         )
         return view if self.lazy else view.compute()
     if len(slice_list) > 1:
         raise ValueError("You can't slice a dictionary of Tensors")
     return self._get_dictionary(subpath, slice_list[0])
Ejemplo n.º 4
0
    def filter(self, dic):
        """| Builds a new DatasetView with the samples whose values match the given dictionary

        Parameters
        ----------
        dic: dictionary
            Key/value pairs every retained sample has to match. Keys are normalised to
            start with "/". For nested schemas use the flattened representation,
            i.e instead of {"abc": {"xyz" : 5}} use {"abc/xyz" : 5}

        Raises
        ------
        KeyError
            If a (normalised) key is not among ``self.keys``
        LargeShapeFilteringException
            If ``_tuple_product`` of the key's ``dtype.max_shape`` is greater than 100
        """
        matching = self.indexes
        for raw_key, expected in dic.items():
            key = raw_key if raw_key.startswith("/") else "/" + raw_key
            if key not in self.keys:
                raise KeyError(f"Key {key} not found in the dataset")

            tsv = self[key]
            if _tuple_product(tsv.dtype.max_shape) > 100:
                raise LargeShapeFilteringException(key)

            # Each key narrows down the indexes that survived the previous keys.
            survivors = []
            for idx in matching:
                if tsv[idx].compute() == expected:
                    survivors.append(idx)
            matching = survivors
        return DatasetView(dataset=self, lazy=self.lazy, indexes=matching)
Ejemplo n.º 5
0
 def __getitem__(self, slice_):
     """| Gets a slice or slices from dataset
     | Usage:
     >>> return ds["image", 5, 0:1920, 0:1080, 0:3].compute() # returns numpy array
     >>> images = ds["image"]
     >>> return images[5].compute() # returns numpy array
     >>> images = ds["image"]
     >>> image = images[5]
     >>> return image[0:1920, 0:1080, 0:3].compute()

     | Dispatch:
     | * no key           -> DatasetView over ``self.indexes[slice]`` (at most one slice allowed)
     | * key only         -> TensorView if the key is in ``self.keys``, ObjectView if the key
     |                       starts with one of ``self.keys``, else ``_get_dictionary(key)``
     | * key and slice(s) -> same order of preference; a key whose top-level schema entry is
     |                       a Sequence only yields a TensorView when at most one slice is given
     | Views are returned as is when ``self.lazy`` is set and computed otherwise.
     """
     if isinstance(slice_, str) or not isinstance(slice_, abc.Iterable):
         slice_ = [slice_]
     subpath, slice_list = slice_split(list(slice_))

     # --- No key: sample selection only ---------------------------------
     if not subpath:
         if len(slice_list) > 1:
             raise ValueError(
                 "Can't slice a dataset with multiple slices without key")
         return DatasetView(
             dataset=self,
             indexes=self.indexes[slice_list[0]],
             lazy=self.lazy,
         )

     # --- Key only: cover every sample ----------------------------------
     if not slice_list:
         if subpath in self.keys:
             view = TensorView(
                 dataset=self,
                 subpath=subpath,
                 slice_=slice(0, self._shape[0]),
                 lazy=self.lazy,
             )
             return view if self.lazy else view.compute()
         if any(subpath.startswith(key) for key in self.keys):
             view = ObjectView(
                 dataset=self,
                 subpath=subpath,
                 lazy=self.lazy,
                 slice_=[slice(0, self._shape[0])],
             )
             return view if self.lazy else view.compute()
         return self._get_dictionary(subpath)

     # --- Key and slices ------------------------------------------------
     # Schema entry of the first path component (subpath starts with "/").
     schema_obj = self.schema.dict_[subpath.split("/")[1]]
     if subpath in self.keys and (len(slice_list) <= 1
                                  or not isinstance(schema_obj, Sequence)):
         view = TensorView(
             dataset=self,
             subpath=subpath,
             slice_=slice_list,
             lazy=self.lazy,
         )
         return view if self.lazy else view.compute()
     if any(subpath.startswith(key) for key in self.keys):
         view = ObjectView(
             dataset=self,
             subpath=subpath,
             slice_=slice_list,
             lazy=self.lazy,
         )
         return view if self.lazy else view.compute()
     if len(slice_list) > 1:
         raise ValueError("You can't slice a dictionary of Tensors")
     return self._get_dictionary(subpath, slice_list[0])
Ejemplo n.º 6
0
    def __getitem__(self, slice_):
        """| Gets a slice from an objectview
        | Works on copies of this view's ``nums``, ``offsets`` and ``squeeze_dims`` and
        | returns a new ObjectView built from them (computed when ``self.lazy`` is not set).

        Raises
        ------
        IndexError
            If more slices are given than there are non-squeezed dimensions left
        """
        if isinstance(slice_, str) or not isinstance(slice_, abc.Iterable):
            slice_ = [slice_]
        subpath, slice_list = slice_split(list(slice_))

        dataset = self.dataset
        # Copies, so the view being sliced keeps its own bookkeeping untouched.
        nums = self.nums.copy()
        offsets = self.offsets.copy()
        squeeze_dims = self.squeeze_dims.copy()
        inner_schema_obj = self.inner_schema_obj

        if subpath:
            inner_schema_obj, nums, offsets, squeeze_dims = self.process_path(
                subpath, inner_schema_obj, nums, offsets, squeeze_dims)
        subpath = self.subpath + subpath

        if slice_list:
            # The first slice goes to the sample dimension, unless the
            # underlying DatasetView has already squeezed that dimension.
            head = slice_list[0]
            if not isinstance(self.dataset, DatasetView):
                num, ofs = slice_extract_info(head, self.dataset.shape[0])
                dataset = DatasetView(self.dataset, num, ofs,
                                      isinstance(head, int))
                slice_list = slice_list[1:]
            elif not self.dataset.squeeze_dim:
                dataset = self.dataset[head]
                slice_list = slice_list[1:]

            if slice_list:
                # Line the remaining slices up with the non-squeezed
                # dimensions; squeezed or unaddressed dimensions get None.
                remaining = iter(slice_list)
                expanded = [
                    None if squeezed else next(remaining, None)
                    for squeezed in squeeze_dims
                ]
                if list(remaining):
                    # More slices than dimensions available
                    raise IndexError("Too many indices")
                for dim, item in enumerate(expanded):
                    if item is None:
                        continue
                    num, ofs = slice_extract_info(item, nums[dim])
                    nums[dim] = num
                    offsets[dim] += ofs
                    squeeze_dims[dim] = num == 1

        objectview = ObjectView(
            dataset=dataset,
            subpath=subpath,
            slice_list=None,
            nums=nums,
            offsets=offsets,
            squeeze_dims=squeeze_dims,
            inner_schema_obj=inner_schema_obj,
            lazy=self.lazy,
            new=False,
        )
        if self.lazy:
            return objectview
        return objectview.compute()
Ejemplo n.º 7
0
    def __init__(
        self,
        dataset,
        subpath=None,
        slice_list=None,
        nums=None,
        offsets=None,
        squeeze_dims=None,
        inner_schema_obj=None,
        lazy=True,
        new=True,
    ):
        """Creates an ObjectView object for dataset from a Dataset, DatasetView or TensorView
        object, or creates a different ObjectView from an existing one

        Parameters
        ----------
        These parameters are used to create a new ObjectView.
        dataset: hub.api.dataset.Dataset object
            The dataset whose ObjectView is being created, or its DatasetView
        subpath: str (optional)
            A potentially incomplete path to any element in the Dataset
        slice_list: optional
            The `slice_` of this Tensor that needs to be accessed
        lazy: bool, optional
            Setting this to False will stop lazy computation and will allow items to be accessed without .compute()

        These parameters are also needed to create an ObjectView from an existing one.
        nums: List[int], optional
            Number of elements in each dimension of the ObjectView to be created.
            Defaults to a new empty list.
        offsets: List[int], optional
            Starting element in each dimension of the ObjectView to be created.
            Defaults to a new empty list.
        squeeze_dims: List[bool], optional
            Whether each dimension can be squeezed or not.
            Defaults to a new empty list.
        inner_schema_obj: Child of hub.schema.Tensor or hub.schema.SchemaDict
            The deepest element in the schema upto which the previous ObjectView had been processed

        new: bool
            Whether to create a new ObjectView object from a Dataset, DatasetView or TensorView
            or create a different ObjectView from an existing one

        Raises
        ------
        IndexError
            If `slice_list` holds more inner slices than there are entries in `nums`
        """
        self.dataset = dataset
        self.schema = (dataset.schema.dict_
                       if not isinstance(dataset, DatasetView) else
                       dataset.dataset.schema.dict_)
        self.subpath = subpath

        # Defaults are created per call: a literal [] in the signature would be
        # one list object shared (and mutated) by every instance relying on it.
        self.nums = [] if nums is None else nums
        self.offsets = [] if offsets is None else offsets
        self.squeeze_dims = [] if squeeze_dims is None else squeeze_dims

        self.inner_schema_obj = inner_schema_obj
        self.lazy = lazy

        if new:
            # Creating new obj
            if self.subpath:
                (
                    self.inner_schema_obj,
                    self.nums,
                    self.offsets,
                    self.squeeze_dims,
                ) = self.process_path(
                    self.subpath,
                    self.inner_schema_obj,
                    self.nums.copy(),
                    self.offsets.copy(),
                    self.squeeze_dims.copy(),
                )

            if slice_list:
                # The first slice selects samples: wrap the dataset in a view.
                num, ofs = slice_extract_info(slice_list[0], dataset.shape[0])
                self.dataset = DatasetView(dataset, num, ofs,
                                           isinstance(slice_list[0], int))

                # Any further slices address the inner dimensions in order.
                inner_slices = slice_list[1:]
                if inner_slices:
                    if len(inner_slices) > len(self.nums):
                        raise IndexError("Too many indices")
                    for i, it in enumerate(inner_slices):
                        num, ofs = slice_extract_info(it, self.nums[i])
                        self.nums[i] = num
                        self.offsets[i] += ofs
                        self.squeeze_dims[i] = num == 1