def __delitem__(self, path):
    try:
        path = posixpath.join(self.bucketpath, path)
        self.s3fs.rm(path, recursive=True)
    except Exception as err:
        logger.error(err)
        raise S3Exception(err)

def create_dataset_entry(self, username, dataset_name, meta, public=True):
    try:
        tag = f"{username}/{dataset_name}"
        repo = f"public/{username}" if public else f"private/{username}"
        response = self.request(
            "POST",
            config.CREATE_DATASET_SUFFIX,
            json={
                "tag": tag,
                "repository": repo,
                "public": public,
                "rewrite": True,
            },
            endpoint=config.HUB_REST_ENDPOINT,
        )
        if response.status_code == 200:
            logger.info(
                f"Your dataset is available at {config.HUB_REST_ENDPOINT}/datasets/explore?tag={tag}"
            )
            if public is False:
                logger.info("The dataset is private so make sure you are logged in!")
    except Exception as e:
        logger.error(
            "Unable to create Dataset entry" + traceback.format_exc() + str(e)
        )

def get_text(input):
    """
    Converts strings stored as ascii value tensors back into strings
    """
    if input.ndim == 1:
        try:
            text = "".join([chr(x) for x in input]).rstrip()
            return text
        except Exception as e:
            logger.error(traceback.format_exc() + str(e))
            raise Exception(
                "get_text can only be called on a tensor of text or a batch of tensors of text"
            )
    elif input.ndim == 2:
        try:
            text = [
                "".join([chr(x) for x in sample]).rstrip() for sample in input
            ]
            return text
        except Exception as e:
            logger.error(traceback.format_exc() + str(e))
            raise Exception(
                "get_text can only be called on a tensor of text or a batch of tensors of text"
            )
    else:
        raise Exception(
            f"Got input of dimension {input.ndim} for get_text. Expected dimension of 1 or 2"
        )

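# Hedged usage sketch for get_text (not from the original source; the sample
# values are made up): strings are stored as ascii codes padded with spaces,
# so a 1-D tensor decodes to one string and a 2-D tensor to a list of strings.
def _example_get_text():
    import numpy as np

    encoded = np.array([ord(c) for c in "Hub  "], dtype=np.uint8)  # "Hub" padded with spaces
    assert get_text(encoded) == "Hub"
    batch = np.array([[ord(c) for c in "Hi "], [ord(c) for c in "Yo "]], dtype=np.uint8)
    assert get_text(batch) == ["Hi", "Yo"]
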
def __call__(self, input):
    try:
        ds = {}
        # print(f"Image id: {input['image_id']}")
        ds["image_id"] = input["image_id"]
        info = input["info"]
        # Each column holds one object per sample (the decoded image array or a
        # per-annotation list), so the columns use object dtype.
        ds["image"] = np.empty(1, object)
        ds["image"][0] = np.array(
            Image.open(
                os.path.join(
                    self._args.dataset_path,
                    get_image_name(self._args, self._tag, input["image_id"]),
                )
            ),
            dtype=np.uint32,
        )
        ds["segmentation"] = np.empty(1, object)
        ds["area"] = np.empty(1, object)
        ds["iscrowd"] = np.empty(1, object)
        ds["bbox"] = np.empty(1, object)
        ds["category_id"] = np.empty(1, object)
        ds["id"] = np.empty(1, object)
        ds["segmentation"][0] = [anno["segmentation"] for anno in info]
        ds["area"][0] = [anno["area"] for anno in info]
        ds["iscrowd"][0] = [anno["iscrowd"] for anno in info]
        ds["bbox"][0] = [anno["bbox"] for anno in info]
        ds["category_id"][0] = [anno["category_id"] for anno in info]
        ds["id"][0] = [anno["id"] for anno in info]
        logger.info(f"Tag: {self._tag}, Index: {input['index']}")
        return ds
    except Exception as e:
        logger.error(e, exc_info=e, stack_info=True)

def exist_bucket(self, bucket):
    try:
        response = self.client.list_buckets()
    except ClientError as e:
        logger.error(e)
        return
    # Use a separate loop variable so the `bucket` argument is not shadowed.
    for b in response["Buckets"]:
        if b["Name"] == bucket:
            return bucket
    return ""

def verify_cli_version():
    try:
        version = pkg_resources.get_distribution(hub.__name__).version
        is_outdated, latest_version = check_outdated(hub.__name__, version)
        if is_outdated:
            print(
                "\033[93m"
                + "Hub is out of date. Please upgrade the package by running `pip3 install --upgrade hub`"
                + "\033[0m"
            )
    except Exception as e:
        logger.error(str(e))

def lookup_hub_bucket(self):
    try:
        response = self.client.list_buckets()
        for bucket in response["Buckets"]:
            if "snark-hub-" in bucket["Name"]:
                return bucket["Name"]
    except ClientError as e:
        logger.error(e)
    return None

def delete_dataset_entry(self, username, dataset_name):
    try:
        tag = f"{username}/{dataset_name}"
        suffix = f"{config.DATASET_SUFFIX}/{tag}"
        self.request(
            "DELETE",
            suffix,
            endpoint=config.HUB_REST_ENDPOINT,
        ).json()
    except Exception as e:
        logger.error(
            "Unable to delete Dataset entry" + traceback.format_exc() + str(e)
        )

def __setitem__(self, path, content):
    try:
        path = posixpath.join(self.path, path)
        attrs = {
            "Bucket": self.bucket,
            "Body": content,
            "Key": path,
            "ContentType": "application/octet-stream",
        }
        self.client.put_object(**attrs)
    except Exception as err:
        logger.error(err)
        raise S3Exception(err)

def __call__(self, input):
    try:
        ds = {}
        n = 1  # for 1 row
        ds["image"] = np.empty(n, object)
        ds["dataset"] = np.empty(n, object)
        ds["isValidation"] = np.empty(n, object)
        ds["img_paths"] = np.empty(n, object)
        ds["img_width"] = np.empty(n, object)
        ds["img_height"] = np.empty(n, object)
        ds["objpos"] = np.empty(n, object)
        ds["joint_self"] = np.empty(n, object)
        ds["scale_provided"] = np.empty(n, object)
        ds["joint_others"] = np.empty(n, object)
        ds["scale_provided_other"] = np.empty(n, object)
        ds["objpos_other"] = np.empty(n, object)
        ds["annolist_index"] = np.empty(n, object)
        ds["people_index"] = np.empty(n, object)
        ds["numOtherPeople"] = np.empty(n, object)
        ds["image"][0] = np.array(
            Image.open(
                os.path.join(self._args.dataset_path, "images", input["img_paths"])
            )
        )
        ds["dataset"][0] = input["dataset"]
        ds["isValidation"][0] = input["isValidation"]
        ds["img_paths"][0] = input["img_paths"]
        ds["img_width"][0] = input["img_width"]
        ds["img_height"][0] = input["img_height"]
        # Some features in the input contain nested lists (lists of lists),
        # so they are converted to arrays with np.array(...).
        ds["objpos"][0] = np.array(input["objpos"])
        ds["joint_self"][0] = np.array(input["joint_self"])
        ds["scale_provided"][0] = input["scale_provided"]
        ds["joint_others"][0] = np.array(input["joint_others"])
        ds["scale_provided_other"][0] = np.array(input["scale_provided_other"])
        ds["objpos_other"][0] = np.array(input["objpos_other"])
        ds["annolist_index"][0] = input["annolist_index"]
        ds["people_index"][0] = input["people_index"]
        ds["numOtherPeople"][0] = input["numOtherPeople"]
        return ds
    except Exception as e:
        logger.error(e, exc_info=e, stack_info=True)

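# Hedged illustration (not from the original source) of the nested-list note
# above: fields such as joint_others arrive as lists of per-person lists, and
# np.array turns them into regular 2-D arrays that fit in one object cell.
def _example_nested_list_to_array():
    import numpy as np

    joint_others = [[1.0, 2.0, 1.0], [3.0, 4.0, 0.0]]  # hypothetical keypoints
    cell = np.empty(1, object)
    cell[0] = np.array(joint_others)
    assert cell[0].shape == (2, 3)
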
def _numpy_load(
    fs: fsspec.AbstractFileSystem, filepath: str, codec: BaseCodec
) -> np.ndarray:
    """Given filesystem and filepath, loads numpy array"""
    # assert fs.exists(
    #     filepath
    # ), f"Dataset file {filepath} does not exist. Your dataset data is likely to be corrupted"
    try:
        with fs.open(filepath, "rb") as f:
            return codec.decode(f.read())
    except Exception as e:
        logger.error(traceback.format_exc() + str(e))
        raise Exception(
            f"Dataset file {filepath} does not exist. Your dataset data is likely to be corrupted"
        )

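# Hedged usage sketch (not part of the library): _numpy_load only needs an
# fsspec filesystem and an object with a decode(bytes) -> np.ndarray method,
# so a minimal stand-in codec based on np.save/np.load can exercise it locally.
# The _NpyCodec class and the /tmp path below are made up for illustration.
def _example_numpy_load(tmp_path="/tmp/hub_example.npy"):
    import io

    import fsspec
    import numpy as np

    class _NpyCodec:
        def decode(self, data: bytes) -> np.ndarray:
            return np.load(io.BytesIO(data))

    fs = fsspec.filesystem("file")
    with fs.open(tmp_path, "wb") as f:
        np.save(f, np.arange(6).reshape(2, 3))
    arr = _numpy_load(fs, tmp_path, _NpyCodec())
    assert arr.shape == (2, 3)
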
def check_token(self, access_token):
    auth = f"Bearer {access_token}"
    response = self.request(
        "GET", config.CHECK_TOKEN_REST_SUFFIX, headers={"Authorization": auth}
    )
    try:
        response_dict = response.json()
        is_valid = response_dict["is_valid"]
    except Exception as e:
        logger.error(f"Exception occurred while validating token: {e}.")
        raise HubException(
            "Error while validating the token. "
            "Please try logging in using username and password."
        )
    return is_valid

def __setitem__(self, path, content):
    self.check_update_creds()
    try:
        path = posixpath.join(self.path, path)
        content = bytearray(memoryview(content))
        attrs = {
            "Bucket": self.bucket,
            "Body": content,
            "Key": path,
            "ContentType": "application/octet-stream",
        }
        self.client.put_object(**attrs)
    except Exception as err:
        logger.error(err)
        raise S3Exception(err)

def update_dataset_state(self, username, dataset_name, state, progress=0):
    try:
        tag = f"{username}/{dataset_name}"
        self.request(
            "POST",
            config.UPDATE_STATE_SUFFIX,
            json={
                "tag": tag,
                "state": state,
                "progress": progress,
            },
            endpoint=config.HUB_REST_ENDPOINT,
        ).json()
    except Exception as e:
        logger.error(
            "Unable to update Dataset entry state " + traceback.format_exc() + str(e)
        )

def store(self, tag, creds=None, session_creds=True) -> "Dataset":
    """
    Stores dataset by tag (filepath) given credentials (can be omitted)
    """
    fs, path = _load_fs_and_path(tag, creds, session_creds=session_creds)
    fs: fsspec.AbstractFileSystem = fs
    if (
        fs.exists(path)
        and not fs.exists(f"{path}/meta.json")
        and not fs.exists(f"{path}/HUB_DATASET")
        and len(fs.ls(path, detail=False)) > 0
    ):
        raise Exception(f"This path {path} is not a dataset path, tag: {tag}")
    self.delete(tag, creds)
    fs.makedirs(path)
    with fs.open(f"{path}/HUB_DATASET", "w") as f:
        f.write("Hello World")
    tensor_paths = [f"{path}/{t}" for t in self._tensors]
    for tensor_path in tensor_paths:
        fs.makedirs(tensor_path)
    tensor_meta = {
        name: _preprocess_meta_before_save(t._meta)
        for name, t in self._tensors.items()
    }
    count = self.count
    try:
        if count == -1:
            count = self._store_unknown_sized_ds(fs, path)
        else:
            self._store_known_sized_ds(fs, path)
    except PermissionError as e:
        logger.error(e)
        raise PermissionException(tag)
    for _, el in tensor_meta.items():
        el["shape"] = (count,) + tuple(el["shape"][1:])
    ds_meta = {"tensors": tensor_meta, "len": count}
    ds_info = dict()
    for key, value in self._metainfo.items():
        ds_info[key] = value
    ds_meta["metainfo"] = ds_info
    with fs.open(f"{path}/meta.json", "w") as f:
        f.write(json.dumps(ds_meta, indent=2, sort_keys=True))
    return load(tag, creds)

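# Hedged follow-up sketch (assuming a dataset object exposing the store()
# method above was already stored under the local tag below, which is made up):
# store() writes a HUB_DATASET marker, per-tensor directories, and a meta.json
# with "tensors", "len" and "metainfo" keys, so the result can be inspected
# with fsspec.
def _example_inspect_stored_dataset(tag="./my_local_dataset"):
    import json

    import fsspec

    fs = fsspec.filesystem("file")
    assert fs.exists(f"{tag}/HUB_DATASET")
    meta = json.loads(fs.open(f"{tag}/meta.json").read())
    print(meta["len"], sorted(meta["tensors"]))
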
def __getitem__(self, path):
    try:
        path = posixpath.join(self.path, path)
        resp = self.client.get_object(
            Bucket=self.bucket,
            Key=path,
        )
        x = resp["Body"].read()
        return x
    except ClientError as err:
        if err.response["Error"]["Code"] == "NoSuchKey":
            raise KeyError(err)
        else:
            raise
    except Exception as err:
        logger.error(err)
        raise S3Exception(err)

def get_access_token(self, username, password):
    response = self.request(
        "GET",
        config.GET_TOKEN_REST_SUFFIX,
        json={"username": username, "password": password},
    )
    try:
        token_dict = response.json()
        token = token_dict["token"]
    except Exception as e:
        logger.error(f"Exception occurred while getting token: {e}.")
        raise HubException(
            "Error while logging in. Please try logging in using access token."
        )
    return token

def create_dataset_entry(self, username, dataset_name, meta, public=True):
    try:
        tag = f"{username}/{dataset_name}"
        repo = f"public/{username}" if public else f"private/{username}"
        self.request(
            "POST",
            config.CREATE_DATASET_SUFFIX,
            json={
                "tag": tag,
                "repository": repo,
                "public": public,
                "rewrite": True,
            },
            endpoint=config.HUB_REST_ENDPOINT,
        ).json()
    except Exception as e:
        logger.error(
            "Unable to create Dataset entry" + traceback.format_exc() + str(e)
        )

def create_bucket(self, bucket_name, region=None):
    """Create an S3 bucket in a specified region

    If a region is not specified, the bucket is created in the S3 default
    region (us-east-1).

    :param bucket_name: Bucket to create
    :param region: String region to create bucket in, e.g., 'us-west-2'
    :return: True if bucket created, else False
    """
    # Create bucket
    try:
        if region is None:
            self.client.create_bucket(Bucket=bucket_name)
        else:
            location = {"LocationConstraint": region}
            self.client.create_bucket(
                Bucket=bucket_name, CreateBucketConfiguration=location
            )
    except ClientError as e:
        logger.error(e)
        return False
    return True

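# Hedged usage sketch (not from the original source): create_bucket() only
# relies on self.client being a boto3 S3 client, so a minimal stand-in object
# can drive it. The _S3Buckets class, bucket name, and region are hypothetical.
def _example_create_bucket(bucket_name="my-hub-demo-bucket"):
    import boto3

    class _S3Buckets:
        def __init__(self):
            self.client = boto3.client("s3")

    helper = _S3Buckets()
    # call the function above explicitly, passing the stand-in as self
    if create_bucket(helper, bucket_name, region="us-west-2"):
        print("bucket created")
    else:
        print("bucket creation failed; see the logged ClientError")
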
def __init__(
    self,
    url: str,
    mode: str = None,
    shape=None,
    schema=None,
    token=None,
    fs=None,
    fs_map=None,
    meta_information=dict(),
    cache: int = defaults.DEFAULT_MEMORY_CACHE_SIZE,
    storage_cache: int = defaults.DEFAULT_STORAGE_CACHE_SIZE,
    lock_cache=True,
    tokenizer=None,
    lazy: bool = True,
    public: bool = True,
    name: str = None,
):
    """| Open a new or existing dataset for read/write

    Parameters
    ----------
    url: str
        The url where dataset is located/should be created
    mode: str, optional (default to "a")
        Python way to tell whether dataset is for read or write (ex. "r", "w", "a")
    shape: tuple, optional
        Tuple with (num_samples,) format, where num_samples is number of samples
    schema: optional
        Describes the data of a single sample. Hub schemas are used for that
        Required for 'a' and 'w' modes
    token: str or dict, optional
        If url is referring to a place where authorization is required,
        token is the parameter to pass the credentials, it can be filepath or dict
    fs: optional
    fs_map: optional
    meta_information: dict, optional
        Additional information about the dataset, given as a dictionary
    cache: int, optional
        Size of the memory cache. Default is 64MB (2**26)
        if 0, False or None, then cache is not used
    storage_cache: int, optional
        Size of the storage cache. Default is 256MB (2**28)
        if 0, False or None, then storage cache is not used
    lock_cache: bool, optional
        Lock the cache for avoiding multiprocessing errors
    lazy: bool, optional
        Setting this to False will stop lazy computation and will allow items
        to be accessed without .compute()
    public: bool, optional
        only applicable if using hub storage, ignored otherwise
        setting this to False allows only the user who created it to access the
        dataset and the dataset won't be visible in the visualizer to the public
    name: str, optional
        only applicable when using hub storage, this is the name that shows up
        on the visualizer
    """
    shape = norm_shape(shape)
    if len(shape) != 1:
        raise ShapeLengthException()
    storage_cache = norm_cache(storage_cache) if cache else 0
    cache = norm_cache(cache)
    schema: SchemaDict = featurify(schema) if schema else None
    self._url = url
    self._token = token
    self.tokenizer = tokenizer
    self.lazy = lazy
    self._name = name
    self._fs, self._path = (
        (fs, url) if fs else get_fs_and_path(self._url, token=token, public=public)
    )
    self._cache = cache
    self._storage_cache = storage_cache
    self.lock_cache = lock_cache
    self.verison = "1.x"
    mode = self._get_mode(mode, self._fs)
    self._mode = mode
    needcreate = self._check_and_prepare_dir()
    fs_map = fs_map or get_storage_map(
        self._fs, self._path, cache, lock=lock_cache, storage_cache=storage_cache
    )
    self._fs_map = fs_map
    self._meta_information = meta_information
    self.username = None
    self.dataset_name = None
    if not needcreate:
        self.meta = json.loads(fs_map["meta.json"].decode("utf-8"))
        self._name = self.meta.get("name") or None
        self._shape = tuple(self.meta["shape"])
        self._schema = hub.schema.deserialize.deserialize(self.meta["schema"])
        self._meta_information = self.meta.get("meta_info") or dict()
        self._flat_tensors = tuple(flatten(self._schema))
        self._tensors = dict(self._open_storage_tensors())
        if shape != (None,) and shape != self._shape:
            raise TypeError(
                f"Shape in metafile [{self._shape}] and shape in arguments [{shape}] are !=, use mode='w' to overwrite dataset"
            )
        if schema is not None and sorted(schema.dict_.keys()) != sorted(
            self._schema.dict_.keys()
        ):
            raise TypeError(
                "Schema in metafile and schema in arguments do not match, use mode='w' to overwrite dataset"
            )
    else:
        if shape[0] is None:
            raise ShapeArgumentNotFoundException()
        if schema is None:
            raise SchemaArgumentNotFoundException()
        try:
            if shape is None:
                raise ShapeArgumentNotFoundException()
            if schema is None:
                raise SchemaArgumentNotFoundException()
            self._schema = schema
            self._shape = tuple(shape)
            self.meta = self._store_meta()
            self._meta_information = meta_information
            self._flat_tensors = tuple(flatten(self.schema))
            self._tensors = dict(self._generate_storage_tensors())
            self.flush()
        except Exception as e:
            try:
                self.close()
            except Exception:
                pass
            self._fs.rm(self._path, recursive=True)
            logger.error("Deleting the dataset " + traceback.format_exc() + str(e))
            raise
    self.indexes = list(range(self._shape[0]))
    if needcreate and (
        self._path.startswith("s3://snark-hub-dev/")
        or self._path.startswith("s3://snark-hub/")
    ):
        subpath = self._path[5:]
        spl = subpath.split("/")
        if len(spl) < 4:
            raise ValueError("Invalid Path for dataset")
        self.username = spl[-2]
        self.dataset_name = spl[-1]
        HubControlClient().create_dataset_entry(
            self.username, self.dataset_name, self.meta, public=public
        )

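# Hedged usage sketch based only on the constructor signature above; the
# hub.schema.Tensor class and the local path are assumptions, not taken from
# this excerpt. Creating a dataset needs a write mode plus shape and schema;
# reopening it later only needs the url.
# ds = Dataset(
#     "./data/example_ds",
#     mode="w",
#     shape=(4,),
#     schema={"value": hub.schema.Tensor(shape=(2,), dtype="uint8")},
# )
# ds.flush()
# ds = Dataset("./data/example_ds", mode="r", lazy=False)
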
def __init__(
    self,
    url: str,
    mode: str = "a",
    safe_mode: bool = False,
    shape=None,
    schema=None,
    token=None,
    fs=None,
    fs_map=None,
    cache: int = 2**26,
    storage_cache: int = 2**28,
    lock_cache=True,
    tokenizer=None,
):
    """| Open a new or existing dataset for read/write

    Parameters
    ----------
    url: str
        The url where dataset is located/should be created
    mode: str, optional (default to "a")
        Python way to tell whether dataset is for read or write (ex. "r", "w", "a")
    safe_mode: bool, optional
        If the dataset already exists, it cannot be rewritten in safe mode;
        otherwise it can be written the first time
    shape: tuple, optional
        Tuple with (num_samples,) format, where num_samples is number of samples
    schema: optional
        Describes the data of a single sample. Hub schemas are used for that
        Required for 'a' and 'w' modes
    token: str or dict, optional
        If url is referring to a place where authorization is required,
        token is the parameter to pass the credentials, it can be filepath or dict
    fs: optional
    fs_map: optional
    cache: int, optional
        Size of the memory cache. Default is 64MB (2**26)
        if 0, False or None, then cache is not used
    storage_cache: int, optional
        Size of the storage cache. Default is 256MB (2**28)
        if 0, False or None, then storage cache is not used
    lock_cache: bool, optional
        Lock the cache for avoiding multiprocessing errors
    """
    shape = shape or (None,)
    if isinstance(shape, int):
        shape = [shape]
    if shape is not None:
        if len(tuple(shape)) != 1:
            raise ShapeLengthException
    if mode is None:
        raise NoneValueException("mode")
    if not cache:
        storage_cache = False
    self.url = url
    self.token = token
    self.mode = mode
    self.tokenizer = tokenizer
    self._fs, self._path = (
        (fs, url) if fs else get_fs_and_path(self.url, token=token)
    )
    self.cache = cache
    self._storage_cache = storage_cache
    self.lock_cache = lock_cache
    self.verison = "1.x"
    needcreate = self._check_and_prepare_dir()
    fs_map = fs_map or get_storage_map(
        self._fs, self._path, cache, lock=lock_cache, storage_cache=storage_cache
    )
    self._fs_map = fs_map
    if safe_mode and not needcreate:
        mode = "r"
    self.username = None
    self.dataset_name = None
    if not needcreate:
        self.meta = json.loads(fs_map["meta.json"].decode("utf-8"))
        self.shape = tuple(self.meta["shape"])
        self.schema = hub.schema.deserialize.deserialize(self.meta["schema"])
        self._flat_tensors = tuple(flatten(self.schema))
        self._tensors = dict(self._open_storage_tensors())
    else:
        if shape[0] is None:
            raise ShapeArgumentNotFoundException()
        if schema is None:
            raise SchemaArgumentNotFoundException()
        try:
            if shape is None:
                raise ShapeArgumentNotFoundException()
            if schema is None:
                raise SchemaArgumentNotFoundException()
            self.schema: HubSchema = featurify(schema)
            self.shape = tuple(shape)
            self.meta = self._store_meta()
            self._flat_tensors = tuple(flatten(self.schema))
            self._tensors = dict(self._generate_storage_tensors())
            self.flush()
        except Exception as e:
            try:
                self.close()
            except Exception:
                pass
            self._fs.rm(self._path, recursive=True)
            logger.error("Deleting the dataset " + traceback.format_exc() + str(e))
            raise
    if needcreate and (
        self._path.startswith("s3://snark-hub-dev/")
        or self._path.startswith("s3://snark-hub/")
    ):
        subpath = self._path[5:]
        spl = subpath.split("/")
        if len(spl) < 4:
            raise ValueError("Invalid Path for dataset")
        self.username = spl[-2]
        self.dataset_name = spl[-1]
        HubControlClient().create_dataset_entry(
            self.username, self.dataset_name, self.meta
        )

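# Hedged usage sketch for the safe_mode flag above (the path is illustrative):
# safe_mode is meant to keep an existing dataset from being rewritten, while a
# missing one is still created from shape and schema as usual.
# ds = Dataset("./data/existing_ds", mode="a", safe_mode=True)
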