def _to_pytorch(
    dataset,
    transform=None,
    inplace=True,
    output_type=dict,
    indexes=None,
):
    """| Converts the dataset into a pytorch compatible format.

    Parameters
    ----------
    transform: function that transforms data in a dict format
    inplace: bool, optional
        Defines if data should be converted to torch.Tensor before or after Transforms applied (depends on what data type you need for Transforms). Default is True.
    output_type: one of list, tuple, dict, optional
        Defines the output type. Default is dict - same as in original Hub Dataset.
    indexes: list or int, optional
        The samples to be converted into pytorch format. Takes all samples in dataset by default.
    """
    try:
        import torch
    except ModuleNotFoundError:
        raise ModuleNotInstalledException("torch")
    global torch

    indexes = indexes or dataset.indexes

    if "r" not in dataset.mode:
        dataset.flush()  # FIXME Without this some tests in test_converters.py fail, not clear why
    return TorchDataset(
        dataset, transform, inplace=inplace, output_type=output_type, indexes=indexes
    )
def to_pytorch(
    self,
    transform=None,
    inplace=True,
    output_type=dict,
    indexes=None,
):
    """| Converts the dataset into a pytorch compatible format.

    Parameters
    ----------
    transform: function that transforms data in a dict format
    inplace: bool, optional
        Defines if data should be converted to torch.Tensor before or after Transforms applied (depends on what data type you need for Transforms). Default is True.
    output_type: one of list, tuple, dict, optional
        Defines the output type. Default is dict - same as in original Hub Dataset.
    indexes: list or int, optional
        The samples to be converted into pytorch format. Takes all samples in dataset by default.
    """
    try:
        import torch
    except ModuleNotFoundError:
        raise ModuleNotInstalledException("torch")
    global torch

    indexes = indexes or self.indexes

    if "r" not in self.mode:
        self.flush()  # FIXME Without this some tests in test_converters.py fail, not clear why
    return TorchDataset(
        self, transform, inplace=inplace, output_type=output_type, indexes=indexes
    )
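# A minimal usage sketch for to_pytorch (assumes hub 1.x and torch are installed;
# the local dataset path and schema below are illustrative, not taken from this module):
def _example_to_pytorch():
    import hub
    from hub.schema import Tensor

    ds = hub.Dataset(
        "./tmp/pytorch_example",
        shape=(8,),
        schema={"data": Tensor(shape=(3,), dtype="float32")},
        mode="w",
    )
    torch_ds = ds.to_pytorch(output_type=dict)
    for sample in torch_ds:  # also usable with torch.utils.data.DataLoader
        print(sample["data"].shape)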
def test_exceptions():
    HubException()
    AuthenticationException()
    AuthorizationException(Response())
    AuthorizationException(Response(noerror=True))
    NotFoundException()
    BadRequestException(Response())
    BadRequestException(Response(noerror=True))
    OverLimitException()
    ServerException()
    BadGatewayException()
    GatewayTimeoutException()
    WaitTimeoutException()
    LockedException()
    HubDatasetNotFoundException("Hello")
    PermissionException("Hello")
    ShapeLengthException()
    ShapeArgumentNotFoundException()
    SchemaArgumentNotFoundException()
    ValueShapeError("Shape 1", "Shape 2")
    NoneValueException("Yahoo!")
    ModuleNotInstalledException("my_module")
    WrongUsernameException("usernameX")
    NotHubDatasetToOverwriteException()
    NotHubDatasetToAppendException()
    DynamicTensorNotFoundException()
    DynamicTensorShapeException("none")
    DynamicTensorShapeException("length")
    DynamicTensorShapeException("not_equal")
    DynamicTensorShapeException("another_cause")
def from_pytorch(dataset):
    """| Converts a pytorch dataset object into hub format

    Parameters
    ----------
    dataset:
        The pytorch dataset object that needs to be converted into hub format
    """
    if "torch" not in sys.modules:
        raise ModuleNotInstalledException("torch")
    else:
        import torch

        global torch

    def generate_schema(dataset):
        sample = dataset[0]
        return dict_to_hub(sample).dict_

    def dict_to_hub(d):
        for k, v in d.items():
            k = k.replace("/", "_")
            if isinstance(v, dict):
                d[k] = dict_to_hub(v)
            else:
                value_shape = v.shape if hasattr(v, "shape") else ()
                shape = tuple([None for it in value_shape])
                max_shape = tuple([10000 for it in value_shape])
                if isinstance(v, torch.Tensor):
                    v = v.numpy()
                dtype = v.dtype.name if hasattr(v, "dtype") else type(v)
                dtype = "int64" if isinstance(v, str) else dtype
                d[k] = (
                    Tensor(shape=shape, dtype=dtype, max_shape=max_shape)
                    if not isinstance(v, str)
                    else Text(shape=(None,), dtype=dtype, max_shape=(10000,))
                )
        return SchemaDict(d)

    my_schema = generate_schema(dataset)

    def transform_numpy(sample):
        d = {}
        for k, v in sample.items():
            k = k.replace("/", "_")
            if not isinstance(v, dict):
                d[k] = v
            else:
                d[k] = transform_numpy(v)
        return d

    @hub.transform(schema=my_schema)
    def my_transform(sample):
        return transform_numpy(sample)

    return my_transform(dataset)
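# A rough usage sketch for from_pytorch with a tiny map-style torch dataset (assumes
# torch and hub are installed; the ToyTorchDataset class and output path are made up
# for illustration; samples are dicts of numpy arrays, which is what the schema
# inference above expects):
def _example_from_pytorch():
    import numpy as np
    import torch

    class ToyTorchDataset(torch.utils.data.Dataset):
        def __len__(self):
            return 4

        def __getitem__(self, idx):
            return {
                "image": np.zeros((8, 8, 3), dtype="uint8"),
                "label": np.array([idx]),
            }

    out_ds = from_pytorch(ToyTorchDataset())
    res_ds = out_ds.store("./tmp/from_pytorch_example")  # res_ds is a usable hub dataset
    return res_ds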
def __init__(self, func, schema, ds, scheduler="ray", workers=1, **kwargs):
    super(RayTransform, self).__init__(
        func, schema, ds, scheduler="single", workers=workers, **kwargs
    )
    self.workers = workers
    if "ray" not in sys.modules:
        raise ModuleNotInstalledException("ray")

    if not ray.is_initialized():
        ray.init(local_mode=True)
def _to_supervisely(dataset, output):
    try:
        import supervisely_lib as sly
        from skvideo.io import vwrite
    except ModuleNotFoundError:
        raise ModuleNotInstalledException("supervisely")

    schema_dict = dataset.schema.dict_
    for key, schem in schema_dict.items():
        if isinstance(schem, Image):
            project_type = "images"
            extension = "jpeg"
            break
        elif isinstance(schem, Video):
            project_type = "videos"
            extension = "mp4"
            break
    else:
        raise Exception

    mode = sly.OpenMode.CREATE
    if project_type == "images":
        _project = sly.Project
    elif project_type == "videos":
        _project = sly.VideoProject
    else:
        raise Exception

    pr = _project(output, mode)
    meta = pr.meta
    meta._project_type = project_type
    # probably here we can create multiple datasets
    out_ds = pr.create_dataset(output)
    try:
        fn_key = "filename"
        dataset[fn_key]
    except KeyError:
        fn_key = None
    zeroes = len(str(len(dataset)))
    for idx, view in enumerate(dataset):
        obj = view[key].compute()
        if fn_key:
            fn = view[fn_key].compute()
        else:
            fn = f"{idx:0{zeroes}}"
        fn = "{}.{}".format(fn, extension)
        # strangely supervisely prevents from using this method on videos
        try:
            out_ds.add_item_np(fn, obj)
        except RuntimeError:
            # fix with in-memory file
            path = "{}/{}".format(out_ds.item_dir, fn)
            vwrite(path, obj)
            out_ds._item_to_ann[fn] = fn + ".json"
            out_ds.set_ann(fn, out_ds._get_empty_annotaion(path))
    pr.set_meta(meta)
    return pr
def _to_pytorch(
    dataset,
    transform=None,
    inplace=True,
    output_type=dict,
    indexes=None,
    key_list=None,
    shuffle=False,
):
    """| Converts the dataset into a pytorch compatible format.

    Parameters
    ----------
    transform: function that transforms data in a dict format
    inplace: bool, optional
        Defines if data should be converted to torch.Tensor before or after Transforms applied (depends on what data type you need for Transforms). Default is True.
    output_type: one of list, tuple, dict, optional
        Defines the output type. Default is dict - same as in original Hub Dataset.
    indexes: list or int, optional
        The samples to be converted into Pytorch format. Takes all samples in dataset by default.
    key_list: list, optional
        The list of keys that are needed in Pytorch format. For nested schemas such as {"a":{"b":{"c": Tensor()}}} use ["a/b/c"] as key_list
    shuffle: bool, optional
        whether to shuffle the data chunkwise or not. Default is False.
    """
    try:
        import torch
    except ModuleNotFoundError:
        raise ModuleNotInstalledException("torch")
    global torch

    indexes = indexes or dataset.indexes

    if "r" not in dataset.mode:
        dataset.flush()  # FIXME Without this some tests in test_converters.py fail, not clear why
    return TorchDataset(
        dataset,
        transform,
        inplace=inplace,
        output_type=output_type,
        indexes=indexes,
        key_list=key_list,
        shuffle=shuffle,
    )
def __init__(self, tensors: Dict[str, Tensor], metainfo=dict()):
    """Creates dict given dict of tensors (name -> Tensor key value pairs)"""
    self._tensors = tensors
    self._metainfo = metainfo
    shape = None
    for name, tensor in tensors.items():
        if shape is None or tensor.ndim > len(shape):
            shape = tensor.shape
        self._len = tensor.count
    self.verison = "0.x"
    if "dask" not in sys.modules:
        raise ModuleNotInstalledException("dask")
    else:
        import dask
        import dask.array

        global dask
def str_to_int(assign_value, tokenizer):
    if isinstance(assign_value, bytes):
        try:
            assign_value = assign_value.decode("utf-8")
        except Exception:
            raise ValueError(
                "Bytes couldn't be decoded to string. Other encodings of bytes are currently not supported"
            )
    if (
        isinstance(assign_value, np.ndarray) and assign_value.dtype.type is np.bytes_
    ) or (isinstance(assign_value, list) and isinstance(assign_value[0], bytes)):
        assign_value = [item.decode("utf-8") for item in assign_value]
    if tokenizer is not None:
        if "transformers" not in sys.modules:
            raise ModuleNotInstalledException("transformers")
        import transformers

        global transformers
        tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased")
        assign_value = (
            np.array(tokenizer(assign_value, add_special_tokens=False)["input_ids"])
            if isinstance(assign_value, str)
            else assign_value
        )
        if (
            isinstance(assign_value, list)
            and assign_value
            and isinstance(assign_value[0], str)
        ):
            assign_value = [
                np.array(tokenizer(item, add_special_tokens=False)["input_ids"])
                for item in assign_value
            ]
    else:
        assign_value = (
            np.array([ord(ch) for ch in assign_value])
            if isinstance(assign_value, str)
            else assign_value
        )
        if (
            isinstance(assign_value, list)
            and assign_value
            and isinstance(assign_value[0], str)
        ):
            assign_value = [np.array([ord(ch) for ch in item]) for item in assign_value]
    return assign_value
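# A quick illustration of str_to_int with tokenizer=None (a sketch; the sample
# strings are made up). Without a tokenizer, each string is encoded as a numpy
# array of Unicode code points via ord():
def _example_str_to_int():
    print(str_to_int("abc", None))            # -> array([97, 98, 99])
    print(str_to_int(b"abc", None))           # bytes are utf-8 decoded first
    print(str_to_int(["hi", "there"], None))  # list of strings -> list of arrays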
def data_from_csv(path, scheduler, workers):
    try:
        import pandas as pd
    except ModuleNotFoundError:
        raise ModuleNotInstalledException("pandas")

    # check if path's contents are all csv files
    if not util.files_are_of_extension(path, util.CSV_EXTS):
        return None

    df = pd.DataFrame()
    files = util.get_children(path)
    for i in files:
        df_csv = pd.read_csv(i)
        df_csv["Filename"] = os.path.basename(i)
        df = pd.concat([df, df_csv])
    schema = {str(i): df[i].dtype for i in df.columns}
    for keys in schema.keys():
        if schema[keys] == np.dtype("O"):
            # Assigning max_shape as the length of the longest string in the column.
            schema[keys] = hub.schema.Text(
                shape=(None,), max_shape=(int(df[keys].str.len().max()),)
            )
        # the below code is to check whether the column is a ClassLabel or not
        # elif schema[keys] == np.dtype("int64"):
        #     if len(np.unique(df[keys])) <= 10:
        #         schema[keys] = hub.schema.ClassLabel(
        #             num_classes=len(np.unique(df[keys]))
        #         )
        #     else:
        #         schema[keys] = hub.schema.Primitive(dtype=schema[keys])
        else:
            schema[keys] = hub.schema.Primitive(dtype=schema[keys])

    @hub.transform(schema=schema, scheduler=scheduler, workers=workers)
    def upload_data(index, df):
        dictionary_cols = {}
        for column in df.columns:
            dictionary_cols[column] = df[column].iloc[index]
        return dictionary_cols

    return upload_data(range(len(df)), df=df)
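# A minimal usage sketch for data_from_csv (the directory path and output location
# are illustrative assumptions; the helper expects a directory containing only CSV
# files and returns a hub transform result that can then be stored):
def _example_data_from_csv():
    out_ds = data_from_csv("./my_csvs", scheduler="single", workers=1)
    if out_ds is not None:  # None means the directory is not all-CSV
        res_ds = out_ds.store("./tmp/csv_ingest")  # res_ds is a usable hub dataset
        return res_ds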
def to_pytorch(self, Transform=None, offset=None, num_samples=None):
    """| Converts the dataset into a pytorch compatible format

    Parameters
    ----------
    Transform: function, optional
        function that transforms data in a dict format
    offset: int, optional
        The offset from which dataset needs to be converted
    num_samples: int, optional
        The number of samples required of the dataset that needs to be converted
    """
    if "torch" not in sys.modules:
        raise ModuleNotInstalledException("torch")
    else:
        import torch

        global torch

    self.flush()  # FIXME Without this some tests in test_converters.py fail, not clear why
    return TorchDataset(self, Transform, offset=offset, num_samples=num_samples)
def __init__(self, meta: dict, daskarray, delayed_objs: tuple = None):
    if "dask" not in sys.modules:
        raise ModuleNotInstalledException("dask")
    else:
        import dask
        import dask.array

        global dask

    if not meta.get("preprocessed"):
        meta = Tensor._preprocess_meta(meta, daskarray)
    self._meta = meta
    self._array = daskarray
    self._delayed_objs = delayed_objs
    self._shape = _dask_shape_backward(daskarray.shape)
    self._dtype = meta["dtype"]
    self._dtag = meta.get("dtag")
    self._dcompress = meta.get("dcompress")
    self._dcompress_algo = meta.get("dcompress_algo")
    self._dcompress_lvl = meta.get("dcompress_lvl")
    self._chunksize = meta.get("chunksize")
def from_array(array, dtag=None, dcompress=None, chunksize=None) -> Tensor:
    """Generates tensor from arraylike object

    Parameters
    ----------
    array : np.ndarray
        Numpy array like object with shape, dtype, dims
    dtag : str, optional
        Describes type of the data stored in this array (image, mask, labels, ...)
    dcompress: str, optional
        Argument for compression algorithm, ignore this one, this one does not have any effect yet!
    chunksize:
        Information about how many items (from axis 0) should be stored in the same file if a command is given to save this tensor

    Returns
    -------
    Tensor
        newly generated tensor itself
    """
    if "dask" not in sys.modules:
        raise ModuleNotInstalledException("dask")
    else:
        import dask
        import dask.array

        global dask

    meta = {
        "dtype": array.dtype,
        "dtag": dtag,
        "dcompress": dcompress,
        "chunksize": chunksize,
    }
    if str(array.dtype) == "object":
        array = dask.array.from_array(array, chunks=1)
    else:
        array = dask.array.from_array(array)
    return Tensor(meta, array)
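# A short usage sketch for from_array (the array contents and dtag are illustrative;
# note that the helper only checks sys.modules, so dask must already be imported by
# the caller before from_array is invoked):
def _example_from_array():
    import dask  # noqa: F401  (must be in sys.modules before calling from_array)
    import numpy as np

    images = np.zeros((10, 64, 64, 3), dtype="uint8")
    t = from_array(images, dtag="image", chunksize=2)
    return t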
def to_tensorflow(self, indexes=None):
    """| Converts the dataset into a tensorflow compatible format

    Parameters
    ----------
    indexes: list or int, optional
        The samples to be converted into tensorflow format. Takes all samples in dataset by default.
    """
    try:
        import tensorflow as tf

        global tf
    except ModuleNotFoundError:
        raise ModuleNotInstalledException("tensorflow")

    indexes = indexes or self.indexes
    indexes = [indexes] if isinstance(indexes, int) else indexes
    _samples_in_chunks = {
        key: (None in value.shape) and 1 or value.chunks[0]
        for key, value in self._tensors.items()
    }
    _active_chunks = {}
    _active_chunks_range = {}

    def _get_active_item(key, index):
        active_range = _active_chunks_range.get(key)
        samples_per_chunk = _samples_in_chunks[key]
        if active_range is None or index not in active_range:
            active_range_start = index - index % samples_per_chunk
            active_range = range(
                active_range_start, active_range_start + samples_per_chunk
            )
            _active_chunks_range[key] = active_range
            _active_chunks[key] = self._tensors[key][
                active_range.start : active_range.stop
            ]
        return _active_chunks[key][index % samples_per_chunk]

    def tf_gen():
        for index in indexes:
            d = {}
            for key in self.keys:
                split_key = key.split("/")
                cur = d
                for i in range(1, len(split_key) - 1):
                    if split_key[i] in cur.keys():
                        cur = cur[split_key[i]]
                    else:
                        cur[split_key[i]] = {}
                        cur = cur[split_key[i]]
                cur[split_key[-1]] = _get_active_item(key, index)
            yield (d)

    def dict_to_tf(my_dtype):
        d = {}
        for k, v in my_dtype.dict_.items():
            d[k] = dtype_to_tf(v)
        return d

    def tensor_to_tf(my_dtype):
        return dtype_to_tf(my_dtype.dtype)

    def dtype_to_tf(my_dtype):
        if isinstance(my_dtype, SchemaDict):
            return dict_to_tf(my_dtype)
        elif isinstance(my_dtype, Tensor):
            return tensor_to_tf(my_dtype)
        elif isinstance(my_dtype, Primitive):
            if str(my_dtype._dtype) == "object":
                return "string"
            return str(my_dtype._dtype)

    def get_output_shapes(my_dtype):
        if isinstance(my_dtype, SchemaDict):
            return output_shapes_from_dict(my_dtype)
        elif isinstance(my_dtype, Tensor):
            return my_dtype.shape
        elif isinstance(my_dtype, Primitive):
            return ()

    def output_shapes_from_dict(my_dtype):
        d = {}
        for k, v in my_dtype.dict_.items():
            d[k] = get_output_shapes(v)
        return d

    output_types = dtype_to_tf(self._schema)
    output_shapes = get_output_shapes(self._schema)

    return tf.data.Dataset.from_generator(
        tf_gen, output_types=output_types, output_shapes=output_shapes
    )
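# A minimal usage sketch for to_tensorflow (assumes hub 1.x and tensorflow are
# installed; the local dataset path and schema are illustrative):
def _example_to_tensorflow():
    import hub
    from hub.schema import Tensor

    ds = hub.Dataset(
        "./tmp/tf_example",
        shape=(4,),
        schema={"data": Tensor(shape=(3,), dtype="float32")},
        mode="w",
    )
    tds = ds.to_tensorflow(indexes=[0, 1])  # only the first two samples
    for sample in tds:
        print(sample["data"].shape)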
def load(tag, creds=None, session_creds=True) -> Dataset:
    """Load a dataset from repository using given url and credentials (optional)"""
    fs, path = _load_fs_and_path(tag, creds, session_creds=session_creds)
    fs: fsspec.AbstractFileSystem = fs
    path_2 = f"{path}/meta.json"
    if not fs.exists(path_2):
        raise HubDatasetNotFoundException(tag)

    with fs.open(path_2, "r") as f:
        ds_meta = json.loads(f.read())

    for name in ds_meta["tensors"]:
        assert fs.exists(
            f"{path}/{name}"
        ), f"Tensor {name} of {tag} dataset does not exist"

    if "dask" not in sys.modules:
        raise ModuleNotInstalledException("dask")
    else:
        import dask
        import dask.array

        global dask

    if ds_meta["len"] == 0:
        logger.warning("The dataset is empty (has 0 samples)")
        return Dataset(
            {
                name: Tensor(
                    tmeta,
                    dask.array.from_array(
                        np.empty(shape=(0,) + tuple(tmeta["shape"][1:]), dtype="uint8"),
                    ),
                )
                for name, tmeta in ds_meta["tensors"].items()
            },
            metainfo=ds_meta.get("metainfo"),
        )
    len_ = ds_meta["len"]

    # added reverse compatibility for previous versions
    for name, tmeta in ds_meta["tensors"].items():
        if "chunksize" not in tmeta:
            tmeta["chunksize"] = 1

    return Dataset(
        {
            name: Tensor(
                tmeta,
                _dask_concat(
                    [
                        dask.array.from_delayed(
                            dask.delayed(_numpy_load)(
                                fs,
                                f"{path}/{name}/{i}.npy",
                                codec_from_name(tmeta.get("dcompress")),
                            ),
                            shape=(min(tmeta["chunksize"], len_ - i),)
                            + tuple(tmeta["shape"][1:]),
                            dtype=tmeta["dtype"],
                        )
                        for i in range(0, len_, tmeta["chunksize"])
                    ]
                ),
            )
            for name, tmeta in ds_meta["tensors"].items()
        },
        metainfo=ds_meta.get("metainfo"),
    )
def _from_pytorch(dataset, scheduler: str = "single", workers: int = 1):
    """| Converts a pytorch dataset object into hub format

    Parameters
    ----------
    dataset:
        The pytorch dataset object that needs to be converted into hub format
    scheduler: str
        choice between "single", "threaded", "processed"
    workers: int
        how many threads or processes to use
    """
    if "torch" not in sys.modules:
        raise ModuleNotInstalledException("torch")
    else:
        import torch

        global torch

    max_dict = defaultdict(lambda: None)

    def sampling(ds):
        for sample in ds:
            dict_sampling(sample)

    def dict_sampling(d, path=""):
        for k, v in d.items():
            k = k.replace("/", "_")
            cur_path = path + "/" + k
            if isinstance(v, dict):
                dict_sampling(v, path=cur_path)
            elif isinstance(v, str):
                if cur_path not in max_dict.keys():
                    max_dict[cur_path] = (len(v),)
                else:
                    max_dict[cur_path] = max(((len(v)),), max_dict[cur_path])
            elif hasattr(v, "shape"):
                if cur_path not in max_dict.keys():
                    max_dict[cur_path] = v.shape
                else:
                    max_dict[cur_path] = tuple(
                        [max(value) for value in zip(max_dict[cur_path], v.shape)]
                    )

    sampling(dataset)

    def generate_schema(dataset):
        sample = dataset[0]
        return dict_to_hub(sample).dict_

    def dict_to_hub(dic, path=""):
        d = {}
        for k, v in dic.items():
            k = k.replace("/", "_")
            cur_path = path + "/" + k
            if isinstance(v, dict):
                d[k] = dict_to_hub(v, path=cur_path)
            else:
                value_shape = v.shape if hasattr(v, "shape") else ()
                if isinstance(v, torch.Tensor):
                    v = v.numpy()
                shape = tuple(None for it in value_shape)
                max_shape = (
                    max_dict[cur_path] or tuple(10000 for it in value_shape)
                    if not isinstance(v, str)
                    else (10000,)
                )
                dtype = v.dtype.name if hasattr(v, "dtype") else type(v)
                dtype = "int64" if isinstance(v, str) else dtype
                d[k] = (
                    Tensor(shape=shape, dtype=dtype, max_shape=max_shape)
                    if not isinstance(v, str)
                    else Text(shape=(None,), dtype=dtype, max_shape=max_shape)
                )
        return SchemaDict(d)

    my_schema = generate_schema(dataset)

    def transform_numpy(sample):
        d = {}
        for k, v in sample.items():
            k = k.replace("/", "_")
            d[k] = transform_numpy(v) if isinstance(v, dict) else v
        return d

    @hub.transform(schema=my_schema, scheduler=scheduler, workers=workers)
    def my_transform(sample):
        return transform_numpy(sample)

    return my_transform(dataset)
def _from_tfds(
    dataset,
    split=None,
    num: int = -1,
    sampling_amount: int = 1,
    scheduler: str = "single",
    workers: int = 1,
):
    """| Converts a TFDS Dataset into hub format.

    Parameters
    ----------
    dataset: str
        The name of the tfds dataset that needs to be converted into hub format
    split: str, optional
        A string representing the splits of the dataset that are required such as "train" or "test+train"
        If not present, all the splits of the dataset are used.
    num: int, optional
        The number of samples required. If not present, all the samples are taken.
        If count is -1, or if count is greater than the size of this dataset, the new dataset will contain all elements of this dataset.
    sampling_amount: float, optional
        a value from 0 to 1, that specifies how much of the dataset would be sampled to determine feature shapes
        value of 0 would mean no sampling and 1 would imply that entire dataset would be sampled
    scheduler: str
        choice between "single", "threaded", "processed"
    workers: int
        how many threads or processes to use

    Examples
    --------
    >>> out_ds = hub.Dataset.from_tfds('mnist', split='test+train', num=1000)
    >>> res_ds = out_ds.store("username/mnist") # res_ds is now a usable hub dataset
    """
    try:
        import tensorflow_datasets as tfds

        global tfds
    except Exception:
        raise ModuleNotInstalledException("tensorflow_datasets")

    ds_info = tfds.load(dataset, with_info=True)

    if split is None:
        all_splits = ds_info[1].splits.keys()
        split = "+".join(all_splits)

    ds = tfds.load(dataset, split=split)
    ds = ds.take(num)
    max_dict = defaultdict(lambda: None)

    def sampling(ds):
        try:
            subset_len = len(ds) if hasattr(ds, "__len__") else num
        except Exception:
            subset_len = max(num, 5)

        subset_len = int(max(subset_len * sampling_amount, 5))
        samples = ds.take(subset_len)
        for smp in samples:
            dict_sampling(smp)

    def dict_sampling(d, path=""):
        for k, v in d.items():
            k = k.replace("/", "_")
            cur_path = path + "/" + k
            if isinstance(v, dict):
                dict_sampling(v)
            elif hasattr(v, "shape") and v.dtype != "string":
                if cur_path not in max_dict.keys():
                    max_dict[cur_path] = v.shape
                else:
                    max_dict[cur_path] = tuple(
                        [max(value) for value in zip(max_dict[cur_path], v.shape)]
                    )
            elif hasattr(v, "shape") and v.dtype == "string":
                if cur_path not in max_dict.keys():
                    max_dict[cur_path] = (len(v.numpy()),)
                else:
                    max_dict[cur_path] = max(((len(v.numpy()),), max_dict[cur_path]))

    if sampling_amount > 0:
        sampling(ds)

    def generate_schema(ds):
        tf_schema = ds[1].features
        return to_hub(tf_schema).dict_

    def to_hub(tf_dt, max_shape=None, path=""):
        if isinstance(tf_dt, tfds.features.FeaturesDict):
            return sdict_to_hub(tf_dt, path=path)
        elif isinstance(tf_dt, tfds.features.Image):
            return image_to_hub(tf_dt, max_shape=max_shape)
        elif isinstance(tf_dt, tfds.features.ClassLabel):
            return class_label_to_hub(tf_dt, max_shape=max_shape)
        elif isinstance(tf_dt, tfds.features.Video):
            return video_to_hub(tf_dt, max_shape=max_shape)
        elif isinstance(tf_dt, tfds.features.Text):
            return text_to_hub(tf_dt, max_shape=max_shape)
        elif isinstance(tf_dt, tfds.features.Sequence):
            return sequence_to_hub(tf_dt, max_shape=max_shape)
        elif isinstance(tf_dt, tfds.features.BBoxFeature):
            return bbox_to_hub(tf_dt, max_shape=max_shape)
        elif isinstance(tf_dt, tfds.features.Audio):
            return audio_to_hub(tf_dt, max_shape=max_shape)
        elif isinstance(tf_dt, tfds.features.Tensor):
            return tensor_to_hub(tf_dt, max_shape=max_shape)
        else:
            if tf_dt.dtype.name != "string":
                return tf_dt.dtype.name

    def sdict_to_hub(tf_dt, path=""):
        d = {}
        for key, value in tf_dt.items():
            key = key.replace("/", "_")
            cur_path = path + "/" + key
            d[key] = to_hub(value, max_dict[cur_path], cur_path)
        return SchemaDict(d)

    def tensor_to_hub(tf_dt, max_shape=None):
        if tf_dt.dtype.name == "string":
            max_shape = max_shape or (100000,)
            return Text(shape=(None,), dtype="int64", max_shape=(100000,))
        dt = tf_dt.dtype.name
        if max_shape and len(max_shape) > len(tf_dt.shape):
            max_shape = max_shape[(len(max_shape) - len(tf_dt.shape)) :]

        max_shape = max_shape or tuple(
            10000 if dim is None else dim for dim in tf_dt.shape
        )
        return Tensor(shape=tf_dt.shape, dtype=dt, max_shape=max_shape)

    def image_to_hub(tf_dt, max_shape=None):
        dt = tf_dt.dtype.name
        if max_shape and len(max_shape) > len(tf_dt.shape):
            max_shape = max_shape[(len(max_shape) - len(tf_dt.shape)) :]

        max_shape = max_shape or tuple(
            10000 if dim is None else dim for dim in tf_dt.shape
        )
        return Image(
            shape=tf_dt.shape,
            dtype=dt,
            max_shape=max_shape,
            # compressor="png"
        )

    def class_label_to_hub(tf_dt, max_shape=None):
        if hasattr(tf_dt, "_num_classes"):
            return ClassLabel(
                num_classes=tf_dt.num_classes,
            )
        else:
            return ClassLabel(names=tf_dt.names)

    def text_to_hub(tf_dt, max_shape=None):
        max_shape = max_shape or (100000,)
        dt = "int64"
        return Text(shape=(None,), dtype=dt, max_shape=max_shape)

    def bbox_to_hub(tf_dt, max_shape=None):
        dt = tf_dt.dtype.name
        return BBox(dtype=dt)

    def sequence_to_hub(tf_dt, max_shape=None):
        return Sequence(dtype=to_hub(tf_dt._feature), shape=())

    def audio_to_hub(tf_dt, max_shape=None):
        if max_shape and len(max_shape) > len(tf_dt.shape):
            max_shape = max_shape[(len(max_shape) - len(tf_dt.shape)) :]

        max_shape = max_shape or tuple(
            100000 if dim is None else dim for dim in tf_dt.shape
        )
        dt = tf_dt.dtype.name
        return Audio(
            shape=tf_dt.shape,
            dtype=dt,
            max_shape=max_shape,
            file_format=tf_dt._file_format,
            sample_rate=tf_dt._sample_rate,
        )

    def video_to_hub(tf_dt, max_shape=None):
        if max_shape and len(max_shape) > len(tf_dt.shape):
            max_shape = max_shape[(len(max_shape) - len(tf_dt.shape)) :]

        max_shape = max_shape or tuple(
            10000 if dim is None else dim for dim in tf_dt.shape
        )
        dt = tf_dt.dtype.name
        return Video(shape=tf_dt.shape, dtype=dt, max_shape=max_shape)

    my_schema = generate_schema(ds_info)

    def transform_numpy(sample):
        d = {}
        for k, v in sample.items():
            k = k.replace("/", "_")
            d[k] = transform_numpy(v) if isinstance(v, dict) else v.numpy()
        return d

    @hub.transform(schema=my_schema, scheduler=scheduler, workers=workers)
    def my_transform(sample):
        return transform_numpy(sample)

    return my_transform(ds)
def _from_tensorflow(ds, scheduler: str = "single", workers: int = 1):
    """Converts a tensorflow dataset into hub format.

    Parameters
    ----------
    dataset:
        The tensorflow dataset object that needs to be converted into hub format
    scheduler: str
        choice between "single", "threaded", "processed"
    workers: int
        how many threads or processes to use

    Examples
    --------
    >>> ds = tf.data.Dataset.from_tensor_slices(tf.range(10))
    >>> out_ds = hub.Dataset.from_tensorflow(ds)
    >>> res_ds = out_ds.store("username/new_dataset") # res_ds is now a usable hub dataset

    >>> ds = tf.data.Dataset.from_tensor_slices({'a': [1, 2], 'b': [5, 6]})
    >>> out_ds = hub.Dataset.from_tensorflow(ds)
    >>> res_ds = out_ds.store("username/new_dataset") # res_ds is now a usable hub dataset

    >>> ds = hub.Dataset(schema=my_schema, shape=(1000,), url="username/dataset_name", mode="w")
    >>> ds = ds.to_tensorflow()
    >>> out_ds = hub.Dataset.from_tensorflow(ds)
    >>> res_ds = out_ds.store("username/new_dataset") # res_ds is now a usable hub dataset
    """
    if "tensorflow" not in sys.modules:
        raise ModuleNotInstalledException("tensorflow")
    else:
        import tensorflow as tf

        global tf

    def generate_schema(ds):
        if isinstance(ds._structure, tf.TensorSpec):
            return tf_to_hub({"data": ds._structure}).dict_
        return tf_to_hub(ds._structure).dict_

    def tf_to_hub(tf_dt):
        if isinstance(tf_dt, dict):
            return dict_to_hub(tf_dt)
        elif isinstance(tf_dt, tf.TensorSpec):
            return TensorSpec_to_hub(tf_dt)

    def TensorSpec_to_hub(tf_dt):
        dt = tf_dt.dtype.name if tf_dt.dtype.name != "string" else "object"
        shape = tuple(tf_dt.shape) if tf_dt.shape.rank is not None else (None,)
        return Tensor(shape=shape, dtype=dt)

    def dict_to_hub(tf_dt):
        d = {key.replace("/", "_"): tf_to_hub(value) for key, value in tf_dt.items()}
        return SchemaDict(d)

    my_schema = generate_schema(ds)

    def transform_numpy(sample):
        d = {}
        for k, v in sample.items():
            k = k.replace("/", "_")
            if not isinstance(v, dict):
                if isinstance(v, (tuple, list)):
                    new_v = list(v)
                    for i in range(len(new_v)):
                        new_v[i] = new_v[i].numpy()
                    d[k] = tuple(new_v) if isinstance(v, tuple) else new_v
                else:
                    d[k] = v.numpy()
            else:
                d[k] = transform_numpy(v)
        return d

    @hub.transform(schema=my_schema, scheduler=scheduler, workers=workers)
    def my_transform(sample):
        sample = sample if isinstance(sample, dict) else {"data": sample}
        return transform_numpy(sample)

    return my_transform(ds)
def _to_tensorflow(dataset, indexes=None, include_shapes=False, key_list=None):
    """| Converts the dataset into a tensorflow compatible format

    Parameters
    ----------
    indexes: list or int, optional
        The samples to be converted into tensorflow format. Takes all samples in dataset by default.
    include_shapes: boolean, optional
        False by default. Setting it to True passes the shapes to tf.data.Dataset.from_generator.
        Setting to True could lead to issues with dictionaries inside Tensors.
    key_list: list, optional
        The list of keys that are needed in tensorflow format. For nested schemas such as {"a":{"b":{"c": Tensor()}}} use ["a/b/c"] as key_list
    """
    try:
        import tensorflow as tf

        global tf
    except ModuleNotFoundError:
        raise ModuleNotInstalledException("tensorflow")

    key_list = key_list or list(dataset.keys)
    key_list = [key if key.startswith("/") else "/" + key for key in key_list]
    for key in key_list:
        if key not in dataset.keys:
            raise KeyError(key)
    indexes = indexes or dataset.indexes
    indexes = [indexes] if isinstance(indexes, int) else indexes
    _samples_in_chunks = {
        key: value.chunks[0] for key, value in dataset._tensors.items()
    }
    _active_chunks = {}
    _active_chunks_range = {}

    def _get_active_item(key, index):
        active_range = _active_chunks_range.get(key)
        samples_per_chunk = _samples_in_chunks[key]
        if active_range is None or index not in active_range:
            active_range_start = index - index % samples_per_chunk
            active_range = range(
                active_range_start,
                min(active_range_start + samples_per_chunk, indexes[-1] + 1),
            )
            _active_chunks_range[key] = active_range
            _active_chunks[key] = dataset._tensors[key][
                active_range.start : active_range.stop
            ]
        return _active_chunks[key][index % samples_per_chunk]

    def tf_gen():
        key_dtype_map = {key: dataset[key, indexes[0]].dtype for key in dataset.keys}
        for index in indexes:
            d = {}
            for key in dataset.keys:
                if key not in key_list:
                    continue
                split_key, cur = key.split("/"), d
                for i in range(1, len(split_key) - 1):
                    if split_key[i] in cur.keys():
                        cur = cur[split_key[i]]
                    else:
                        cur[split_key[i]] = {}
                        cur = cur[split_key[i]]
                cur[split_key[-1]] = _get_active_item(key, index)
                if isinstance(key_dtype_map[key], Text):
                    value = cur[split_key[-1]]
                    cur[split_key[-1]] = (
                        "".join(chr(it) for it in value.tolist())
                        if value.ndim == 1
                        else ["".join(chr(it) for it in val.tolist()) for val in value]
                    )
            yield (d)

    def dict_to_tf(my_dtype, path=""):
        d = {}
        for k, v in my_dtype.dict_.items():
            for key in key_list:
                if key.startswith(path + "/" + k):
                    d[k] = dtype_to_tf(v, path + "/" + k)
                    break
        return d

    def tensor_to_tf(my_dtype):
        return dtype_to_tf(my_dtype.dtype)

    def text_to_tf(my_dtype):
        return "string"

    def dtype_to_tf(my_dtype, path=""):
        if isinstance(my_dtype, SchemaDict):
            return dict_to_tf(my_dtype, path=path)
        elif isinstance(my_dtype, Text):
            return text_to_tf(my_dtype)
        elif isinstance(my_dtype, Tensor):
            return tensor_to_tf(my_dtype)
        elif isinstance(my_dtype, Primitive):
            if str(my_dtype._dtype) == "object":
                return "string"
            return str(my_dtype._dtype)

    def get_output_shapes(my_dtype, path=""):
        if isinstance(my_dtype, SchemaDict):
            return output_shapes_from_dict(my_dtype, path=path)
        elif isinstance(my_dtype, (Text, Primitive)):
            return ()
        elif isinstance(my_dtype, Tensor):
            return my_dtype.shape

    def output_shapes_from_dict(my_dtype, path=""):
        d = {}
        for k, v in my_dtype.dict_.items():
            for key in key_list:
                if key.startswith(path + "/" + k):
                    d[k] = get_output_shapes(v, path + "/" + k)
                    break
        return d

    output_types = dtype_to_tf(dataset._schema)
    if include_shapes:
        output_shapes = get_output_shapes(dataset._schema)
        return tf.data.Dataset.from_generator(
            tf_gen, output_types=output_types, output_shapes=output_shapes
        )
    else:
        return tf.data.Dataset.from_generator(tf_gen, output_types=output_types)
def to_tensorflow(self, indexes=None):
    """| Converts the dataset into a tensorflow compatible format

    Parameters
    ----------
    indexes: list or int, optional
        The samples to be converted into tensorflow format. Takes all samples in dataset by default.
    """
    if "tensorflow" not in sys.modules:
        raise ModuleNotInstalledException("tensorflow")
    else:
        import tensorflow as tf

        global tf

    indexes = indexes or self.indexes
    indexes = [indexes] if isinstance(indexes, int) else indexes

    def tf_gen():
        for index in indexes:
            d = {}
            for key in self.keys:
                split_key = key.split("/")
                cur = d
                for i in range(1, len(split_key) - 1):
                    if split_key[i] in cur.keys():
                        cur = cur[split_key[i]]
                    else:
                        cur[split_key[i]] = {}
                        cur = cur[split_key[i]]
                cur[split_key[-1]] = self._tensors[key][index]
            yield (d)

    def dict_to_tf(my_dtype):
        d = {}
        for k, v in my_dtype.dict_.items():
            d[k] = dtype_to_tf(v)
        return d

    def tensor_to_tf(my_dtype):
        return dtype_to_tf(my_dtype.dtype)

    def dtype_to_tf(my_dtype):
        if isinstance(my_dtype, SchemaDict):
            return dict_to_tf(my_dtype)
        elif isinstance(my_dtype, Tensor):
            return tensor_to_tf(my_dtype)
        elif isinstance(my_dtype, Primitive):
            if str(my_dtype._dtype) == "object":
                return "string"
            return str(my_dtype._dtype)

    def get_output_shapes(my_dtype):
        if isinstance(my_dtype, SchemaDict):
            return output_shapes_from_dict(my_dtype)
        elif isinstance(my_dtype, Tensor):
            return my_dtype.shape
        elif isinstance(my_dtype, Primitive):
            return ()

    def output_shapes_from_dict(my_dtype):
        d = {}
        for k, v in my_dtype.dict_.items():
            d[k] = get_output_shapes(v)
        return d

    output_types = dtype_to_tf(self._schema)
    output_shapes = get_output_shapes(self._schema)

    return tf.data.Dataset.from_generator(
        tf_gen, output_types=output_types, output_shapes=output_shapes
    )
def init(
    token: str = "",
    cloud=False,
    n_workers=1,
    memory_limit=None,
    processes=False,
    threads_per_worker=1,
    distributed=True,
):
    """Initializes cluster either local or on the cloud

    Parameters
    ----------
    token: str
        token provided by snark
    cloud: bool
        Should be run locally or on the cloud
    n_workers: int
        number of concurrent workers, defaults to 1
    memory_limit: int, optional
        memory limit per worker, defaults to the available virtual memory
    processes: bool
        whether workers should run as separate processes, defaults to False
    threads_per_worker: int
        Number of threads per each worker
    distributed: bool
        whether to create a dask.distributed Client; if False the threaded scheduler is used
    """
    print("initialized")
    if "dask" not in sys.modules:
        raise ModuleNotInstalledException("dask")
    else:
        import dask
        from dask.distributed import Client

        global dask
        global Client

    global _client
    if _client is not None:
        _client.close()

    if cloud:
        raise NotImplementedError
    elif not distributed:
        client = None
        dask.config.set(scheduler="threading")
        hub.config.DISTRIBUTED = False
    else:
        n_workers = n_workers if n_workers is not None else psutil.cpu_count()
        memory_limit = (
            memory_limit
            if memory_limit is not None
            else psutil.virtual_memory().available
        )

        local_directory = os.path.join(
            os.path.expanduser("~"),
            ".activeloop",
            "tmp",
        )
        if not os.path.exists(local_directory):
            os.makedirs(local_directory)
        client = Client(
            n_workers=n_workers,
            processes=processes,
            memory_limit=memory_limit,
            threads_per_worker=threads_per_worker,
            local_directory=local_directory,
        )
        config.DISTRIBUTED = True

    _client = client
    return client
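# A minimal usage sketch for init (a local, single-machine setup; the worker counts
# are illustrative and dask[distributed] is assumed to be installed). dask has to be
# imported first so the sys.modules check above passes:
def _example_init():
    import dask  # noqa: F401  (must be in sys.modules before calling init)

    client = init(n_workers=2, threads_per_worker=1, distributed=True)
    # ... run hub operations that use the dask cluster ...
    if client is not None:
        client.close()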
def _from_supervisely(project, scheduler: str = "single", workers: int = 1):
    try:
        import supervisely_lib as sly
        from supervisely_lib.project import project as sly_image_project
        from supervisely_lib.project import video_project as sly_video_project
        from skvideo.io import FFmpegReader, vread
    except ModuleNotFoundError:
        raise ModuleNotInstalledException("supervisely")

    if isinstance(project, str):
        with open(project + "meta.json") as meta_file:
            project_meta_dict = json.load(meta_file)
        instantiated = False
    else:
        project_meta_dict = project.meta.to_json()
        instantiated = True
    project_type = project_meta_dict["projectType"]
    mode = sly.OpenMode.READ

    def infer_image(paths):
        bboxes, masks = [], []
        classes_bb, classes_mask = [], []
        item_path, item_ann_path = paths
        ann = sly.Annotation.load_json_file(item_ann_path, project.meta)
        ann_dict = ann.to_json()
        sizes = (ann_dict["size"]["height"], ann_dict["size"]["width"])
        for obj in ann_dict["objects"]:
            if obj["geometryType"] == "rectangle":
                bboxes.append(
                    [item for sublist in obj["points"]["exterior"] for item in sublist]
                )
                classes_bb.append(obj["classTitle"])
            elif obj["geometryType"] == "polygon":
                img = PIL.Image.new("L", (sizes[1], sizes[0]), 0)
                PIL.ImageDraw.Draw(img).polygon(
                    [tuple(obj) for obj in obj["points"]["exterior"]],
                    outline=1,
                    fill=1,
                )
                masks.append(np.array(img))
                classes_mask.append(obj["classTitle"])
        return sizes, bboxes, masks, classes_bb, classes_mask

    def infer_video(paths):
        item_path, item_ann_path = paths
        vreader = FFmpegReader(item_path)
        return (vreader.getShape(),)

    def infer_project(project, project_type, read_mode):
        if project_type == "images":
            if not instantiated:
                project = sly_image_project.Project(project, mode)
            max_shape = (0, 0)
            return (
                project,
                Image,
                infer_image,
                max_shape,
            )
        elif project_type == "videos":
            if not instantiated:
                project = sly_video_project.VideoProject(project, mode)
            max_shape = (0, 0, 0, 0)
            return (
                project,
                Video,
                infer_video,
                max_shape,
            )

    project, main_blob, infer_ds, max_shape = infer_project(
        project, project_type, mode
    )

    image_paths = []
    label_names = []
    max_num_bboxes = 0
    max_num_polys = 0
    masks = False
    datasets = project.datasets.items()
    uniform = True
    for ds in datasets:
        for i, item in enumerate(ds):
            path = ds.get_item_paths(item)
            image_paths.append(path)
            inf = infer_ds(path)
            if len(inf) > 1:
                if inf[3]:
                    label_names.extend(inf[3])
                    if len(inf[3]) > max_num_bboxes:
                        max_num_bboxes = len(inf[3])
                if inf[4]:
                    label_names.extend(inf[4])
                    if len(inf[4]) > max_num_polys:
                        max_num_polys = len(inf[4])
                if inf[2]:
                    masks = True
            shape = inf[0]
            max_shape = np.maximum(shape, max_shape)
            if uniform and max_shape.any() and (shape != max_shape).any():
                uniform = False
    label_names = list(np.unique(label_names))
    items = chain(*datasets)
    idatasets = iter(datasets)
    ds, i = next(idatasets), 0
    key = "shape" if uniform else "max_shape"
    if project_type == "images":
        read = sly.imaging.image.read
        blob_shape = {key: (*max_shape.tolist(), 3)}
    elif project_type == "videos":
        read = vread
        blob_shape = {key: max_shape.tolist()}
        if key == "max_shape":
            blob_shape["shape"] = (None, None, None, 3)
    schema = {
        project_type: main_blob(**blob_shape),
    }
    if max_num_bboxes:
        schema["bbox"] = BBox(shape=(None, 4), max_shape=(max_num_bboxes, 4))
    if label_names:
        schema["label"] = ClassLabel(
            shape=(None,),
            max_shape=(max(max_num_bboxes, max_num_polys),),
            names=label_names,
        )
    if masks:
        schema["mask"] = Mask(
            shape=(None, None, None), max_shape=(*max_shape.tolist(), 1)
        )

    @hub.transform(schema=schema, scheduler=scheduler, workers=workers)
    def transformation(item):
        nonlocal i, ds
        sample = {}
        if i >= len(ds):
            ds, i = next(idatasets), 0
        item_path, item_ann_path = ds.get_item_paths(item)
        i += 1
        _, bboxes, masks, classes_bbox, classes_mask = infer_ds(
            (item_path, item_ann_path)
        )
        sample[project_type] = read(item_path)
        if bboxes:
            sample["bbox"] = np.array(bboxes)
            sample["label"] = [label_names.index(i) for i in classes_bbox]
        if masks:
            sample["mask"] = np.expand_dims(masks[0], -1)
            sample["label"] = [label_names.index(i) for i in classes_mask]
        return sample

    return transformation(list(items))
def _to_tensorflow(dataset, indexes=None, include_shapes=False):
    """| Converts the dataset into a tensorflow compatible format

    Parameters
    ----------
    indexes: list or int, optional
        The samples to be converted into tensorflow format. Takes all samples in dataset by default.
    include_shapes: boolean, optional
        False by default. Setting it to True passes the shapes to tf.data.Dataset.from_generator.
        Setting to True could lead to issues with dictionaries inside Tensors.
    """
    try:
        import tensorflow as tf

        global tf
    except ModuleNotFoundError:
        raise ModuleNotInstalledException("tensorflow")

    indexes = indexes or dataset.indexes
    indexes = [indexes] if isinstance(indexes, int) else indexes
    _samples_in_chunks = {
        key: (None in value.shape) and 1 or value.chunks[0]
        for key, value in dataset._tensors.items()
    }
    _active_chunks = {}
    _active_chunks_range = {}

    def _get_active_item(key, index):
        active_range = _active_chunks_range.get(key)
        samples_per_chunk = _samples_in_chunks[key]
        if active_range is None or index not in active_range:
            active_range_start = index - index % samples_per_chunk
            active_range = range(
                active_range_start, active_range_start + samples_per_chunk
            )
            _active_chunks_range[key] = active_range
            _active_chunks[key] = dataset._tensors[key][
                active_range.start : active_range.stop
            ]
        return _active_chunks[key][index % samples_per_chunk]

    def tf_gen():
        for index in indexes:
            d = {}
            for key in dataset.keys:
                split_key = key.split("/")
                cur = d
                for i in range(1, len(split_key) - 1):
                    if split_key[i] in cur.keys():
                        cur = cur[split_key[i]]
                    else:
                        cur[split_key[i]] = {}
                        cur = cur[split_key[i]]
                cur[split_key[-1]] = _get_active_item(key, index)
            yield (d)

    def dict_to_tf(my_dtype):
        d = {}
        for k, v in my_dtype.dict_.items():
            d[k] = dtype_to_tf(v)
        return d

    def tensor_to_tf(my_dtype):
        return dtype_to_tf(my_dtype.dtype)

    def dtype_to_tf(my_dtype):
        if isinstance(my_dtype, SchemaDict):
            return dict_to_tf(my_dtype)
        elif isinstance(my_dtype, Tensor):
            return tensor_to_tf(my_dtype)
        elif isinstance(my_dtype, Primitive):
            if str(my_dtype._dtype) == "object":
                return "string"
            return str(my_dtype._dtype)

    def get_output_shapes(my_dtype):
        if isinstance(my_dtype, SchemaDict):
            return output_shapes_from_dict(my_dtype)
        elif isinstance(my_dtype, Tensor):
            return my_dtype.shape
        elif isinstance(my_dtype, Primitive):
            return ()

    def output_shapes_from_dict(my_dtype):
        d = {}
        for k, v in my_dtype.dict_.items():
            d[k] = get_output_shapes(v)
        return d

    output_types = dtype_to_tf(dataset._schema)
    if include_shapes:
        output_shapes = get_output_shapes(dataset._schema)
        return tf.data.Dataset.from_generator(
            tf_gen, output_types=output_types, output_shapes=output_shapes
        )
    else:
        return tf.data.Dataset.from_generator(tf_gen, output_types=output_types)
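# A small self-contained sketch of the nested-key handling used by the tf_gen
# generators above: flattened schema keys such as "/a/b/c" are rebuilt into nested
# dicts one level at a time (the keys and placeholder values here are made up):
def _example_nested_keys():
    keys = ["/image", "/meta/label", "/meta/box/coords"]
    d = {}
    for key in keys:
        split_key = key.split("/")
        cur = d
        for i in range(1, len(split_key) - 1):
            cur = cur.setdefault(split_key[i], {})
        cur[split_key[-1]] = None  # placeholder for the actual tensor value
    # d == {"image": None, "meta": {"label": None, "box": {"coords": None}}}
    return d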