def _connect(tag, public=True):
    """Connect to the backend and receive storage credentials.

    Parameters
    ----------
    tag: str
        Dataset tag, e.g. "username/dataset".
    public: bool, optional
        Accepted for interface compatibility with get_fs_and_path(), which
        passes it as a keyword (``_connect(url, public=public)``); it is not
        used when resolving the path here.

    Returns
    -------
    tuple
        (path, creds) — the storage path for the tag and the credentials dict.
    """
    creds = HubControlClient().get_config()
    dataset = HubControlClient().get_dataset_path(tag)

    # If dataset is registered in the DB then use its stored path,
    # otherwise construct the path from the tag.
    if dataset and "path" in dataset:
        path = dataset["path"]
    else:
        sub_tags = tag.split("/")
        # Repository root comes from the credential bucket location.
        path = "/".join(creds["bucket"].split("/")[:-1])
        path = f"{path}/{sub_tags[0]}/{sub_tags[-1]}"
    return path, creds
def login_fn(username, password, token=""):
    """Log in to Snark AI.

    Parameters
    ----------
    username: str
        Activeloop username; prompted for interactively when empty.
    password: str
        Activeloop password; prompted for interactively when empty.
    token: str, optional
        Authentication token; when truthy the token flow is used and
        username/password are skipped. Defaults to "" (credential flow),
        which keeps the original call signature working unchanged.
    """
    if token:
        logger.info("Token login.")
        # Fixed: was logger.degug (AttributeError when this branch runs).
        logger.debug("Getting the token...")
        # Fixed: type= and hide_input= belong to click.prompt, not str.format.
        token = click.prompt(
            "Please paste the authentication token from {}".format(
                config.GET_TOKEN_REST_SUFFIX),
            type=str,
            hide_input=True,
        )
        token = token.strip()
        # Fixed: instantiate the client, consistent with
        # AuthClient().get_access_token below.
        AuthClient().check_token(token)
    else:
        logger.info(
            "Please log in using Activeloop credentials. You can register at https://app.activeloop.ai "
        )
        if not username:
            logger.debug("Prompting for username.")
            username = click.prompt("Username", type=str)
            username = username.strip()
        if not password:
            logger.debug("Prompting for password.")
            password = click.prompt("Password", type=str, hide_input=True)
            password = password.strip()
        token = AuthClient().get_access_token(username, password)
    TokenManager.set_token(token)
    # Warm the credential cache for the freshly authenticated user.
    HubControlClient().get_credentials()
    logger.info("Login Successful.")
def get_fs_and_path(url: str, token=None, public=True) -> Tuple[fsspec.AbstractFileSystem, str]:
    """Resolve a dataset url to a (filesystem, path) pair.

    Dispatches on the url scheme: s3://, gcs://, Azure blob urls,
    local filesystem paths (including Windows drive letters), and —
    as the fallback — bare Hub tags ("user/dataset") which are resolved
    through the Hub backend.

    Parameters
    ----------
    url: str
        Storage url or Hub dataset tag.
    token: optional
        Credentials; semantics depend on the scheme (AWS creds dict or
        filepath for s3, gcs token, dict with "account_key" for Azure).
    public: bool, optional
        Forwarded to _connect() for Hub-tag urls.
    """
    if url.startswith("s3://"):
        token = token or dict()
        # token may be a path to an AWS credentials file — load it first.
        token = read_aws_creds(token) if isinstance(token, str) else token
        return (
            S3FileSystemReplacement(
                key=token.get("aws_access_key_id"),
                secret=token.get("aws_secret_access_key"),
                token=token.get("aws_session_token"),
                client_kwargs={
                    "endpoint_url": token.get("endpoint_url"),
                    "region_name": token.get("region"),
                },
            ),
            url[5:],  # strip the "s3://" prefix
        )
    elif url.startswith("gcs://"):
        return gcsfs.GCSFileSystem(token=token), url[6:]
    elif url.find("blob.core.windows.net/") != -1:
        account_name = url.split(".")[0]
        # Drop the "https://" prefix from the account name if present.
        account_name = account_name[8:] if url.startswith(
            "https://") else account_name
        return (
            AzureBlobFileSystem(
                account_name=account_name,
                account_key=token.get("account_key"),
            ),
            # Path after the host; len("blob.core.windows.net/") == 22.
            url[url.find("blob.core.windows.net/") + 22:],
        )
    elif (url.startswith("../") or url.startswith("./") or url.startswith("/")
          or url.startswith("~/")):
        return fsspec.filesystem("file"), url
    elif (
            # windows local file system, e.g. "C:\data"
            re.search("^[A-Za-z]:", url)):
        return fsspec.filesystem("file"), url
    else:
        # TODO check if url is username/dataset:version
        if url.split("/")[0] == "google":
            org_id, ds_name = url.split("/")
            token, url = HubControlClient().get_dataset_credentials(
                org_id, ds_name)
            fs = gcsfs.GCSFileSystem(token=token)
            url = url[6:]  # strip the "gcs://" prefix
        else:
            # Bare Hub tag: resolve path + temporary S3 creds via backend.
            url, creds = _connect(url, public=public)
            fs = S3FileSystemReplacement(
                expiration=creds["expiration"],
                key=creds["access_key"],
                secret=creds["secret_key"],
                token=creds["session_token"],
                client_kwargs={
                    "endpoint_url": creds["endpoint"],
                    "region_name": creds["region"],
                },
            )
        return (fs, url)
def _connect(tag, public=True):
    """Connect to the backend and receive storage credentials.

    Parameters
    ----------
    tag: str
        Dataset tag, e.g. "username/dataset".
    public: bool, optional
        Accepted for interface compatibility with get_fs_and_path(), which
        passes it as a keyword; not used when resolving the path here.

    Returns
    -------
    tuple
        (path, creds) — the storage path for the tag and the credentials dict.
    """
    creds = HubControlClient().get_config()
    dataset = HubControlClient().get_dataset_path(tag)

    if dataset and "path" in dataset:
        path = dataset["path"]
    else:
        sub_tags = tag.split("/")
        real_tag = sub_tags[-1]
        # Tag refers to another user's dataset: point the bucket at that
        # user's directory instead of ours.
        if len(sub_tags) > 1 and sub_tags[0] != creds["_id"]:
            # Replace only the final path component; str.replace(username, ...)
            # would corrupt the path if the username also occurred earlier
            # in the bucket url.
            bucket_parts = creds["bucket"].split("/")
            bucket_parts[-1] = sub_tags[0]
            creds["bucket"] = "/".join(bucket_parts)
        path = f"{creds['bucket']}/{real_tag}"
    return path, creds
def delete(self):
    """Remove the dataset from storage and deregister it if tracked.

    Returns
    -------
    bool
        True when the dataset existed (had a meta.json) and was removed,
        False otherwise.
    """
    fs, path = self._fs, self._path
    meta_path = posixpath.join(path, "meta.json")
    if not fs.exists(meta_path):
        return False
    fs.rm(path, recursive=True)
    if self.username is not None:
        HubControlClient().delete_dataset_entry(self.username,
                                                self.dataset_name)
    return True
def delete(self):
    """Delete the dataset from storage and deregister it if tracked.

    Returns
    -------
    bool
        True when the dataset existed (had a meta file) and was removed,
        False otherwise.
    """
    fs, path = self._fs, self._path
    meta_path = posixpath.join(path, defaults.META_FILE)
    if not fs.exists(meta_path):
        return False
    fs.rm(path, recursive=True)
    if self.username is not None:
        HubControlClient().delete_dataset_entry(self.username,
                                                self.dataset_name)
    return True
def copy(self, dst_url: str, token=None, fs=None, public=True):
    """| Creates a copy of the dataset at the specified url and returns the dataset object

    Parameters
    ----------
    dst_url: str
        The destination url where dataset should be copied
    token: str or dict, optional
        If dst_url is refering to a place where authorization is required,
        token is the parameter to pass the credentials, it can be filepath or dict
    fs: optional
    public: bool, optional
        only applicable if using hub storage, ignored otherwise
        setting this to False allows only the user who created it to access the new copied dataset and
        the dataset won't be visible in the visualizer to the public
    """
    # Persist any pending writes before copying.
    self.flush()
    path = _copy_helper(
        dst_url=dst_url,
        token=token,
        fs=fs,
        public=public,
        src_url=self._path,
        src_fs=self._fs,
    )
    # Register the new copy in the database when it lives in hub storage.
    if path.startswith(("s3://snark-hub-dev/", "s3://snark-hub/")):
        parts = path[5:].split("/")
        if len(parts) < 4:
            raise ValueError("Invalid Path for dataset")
        HubControlClient().create_dataset_entry(parts[-2], parts[-1],
                                                self.meta, public=public)
    return hub.Dataset(dst_url, token=token, fs=fs, public=public)
def check_update_creds(self):
    """Refresh the temporary S3 credentials when they have expired.

    Rebuilds both the boto3 client and resource with freshly fetched
    credentials; a no-op while the current credentials are still valid
    (or when no expiration is set).
    """
    if not (self.expiration and float(self.expiration) < time.time()):
        return
    details = HubControlClient().get_credentials()
    self.expiration = details["expiration"]
    # Client and resource share the exact same connection settings.
    common = dict(
        aws_access_key_id=details["access_key"],
        aws_secret_access_key=details["secret_key"],
        aws_session_token=details["session_token"],
        config=self.client_config,
        endpoint_url=self.endpoint_url,
        region_name=self.aws_region,
    )
    self.client = boto3.client("s3", **common)
    self.resource = boto3.resource("s3", **common)
def _update_dataset_state(self): if self.username is not None: HubControlClient().update_dataset_state(self.username, self.dataset_name, "UPLOADED")
def __init__(
    self,
    url: str,
    mode: str = None,
    shape=None,
    schema=None,
    token=None,
    fs=None,
    fs_map=None,
    meta_information=dict(),
    cache: int = defaults.DEFAULT_MEMORY_CACHE_SIZE,
    storage_cache: int = defaults.DEFAULT_STORAGE_CACHE_SIZE,
    lock_cache=True,
    tokenizer=None,
    lazy: bool = True,
    public: bool = True,
    name: str = None,
):
    """| Open a new or existing dataset for read/write

    Parameters
    ----------
    url: str
        The url where dataset is located/should be created
    mode: str, optional (default to "a")
        Python way to tell whether dataset is for read or write (ex. "r", "w", "a")
    shape: tuple, optional
        Tuple with (num_samples,) format, where num_samples is number of samples
    schema: optional
        Describes the data of a single sample. Hub schemas are used for that
        Required for 'a' and 'w' modes
    token: str or dict, optional
        If url is refering to a place where authorization is required,
        token is the parameter to pass the credentials, it can be filepath or dict
    fs: optional
    fs_map: optional
    meta_information: optional
        give information about dataset in a dictionary
    cache: int, optional
        Size of the memory cache. Default is 64MB (2**26)
        if 0, False or None, then cache is not used
    storage_cache: int, optional
        Size of the storage cache. Default is 256MB (2**28)
        if 0, False or None, then storage cache is not used
    lock_cache: bool, optional
        Lock the cache for avoiding multiprocessing errors
    lazy: bool, optional
        Setting this to False will stop lazy computation and will allow
        items to be accessed without .compute()
    public: bool, optional
        only applicable if using hub storage, ignored otherwise
        setting this to False allows only the user who created it to access
        the dataset and the dataset won't be visible in the visualizer to the public
    name: str, optional
        only applicable when using hub storage, this is the name that
        shows up on the visualizer
    """
    # Normalize shape to a 1-tuple; only (num_samples,) datasets are supported.
    shape = norm_shape(shape)
    if len(shape) != 1:
        raise ShapeLengthException()
    # Storage cache is only meaningful when the memory cache is enabled.
    storage_cache = norm_cache(storage_cache) if cache else 0
    cache = norm_cache(cache)
    schema: SchemaDict = featurify(schema) if schema else None
    self._url = url
    self._token = token
    self.tokenizer = tokenizer
    self.lazy = lazy
    self._name = name
    # Resolve the storage backend unless an fs was passed in explicitly.
    self._fs, self._path = ((fs, url) if fs else get_fs_and_path(
        self._url, token=token, public=public))
    self._cache = cache
    self._storage_cache = storage_cache
    self.lock_cache = lock_cache
    # NOTE(review): attribute name "verison" is a typo, preserved because
    # other code may read it — confirm before renaming.
    self.verison = "1.x"
    mode = self._get_mode(mode, self._fs)
    self._mode = mode
    # True when the dataset directory must be created from scratch.
    needcreate = self._check_and_prepare_dir()
    fs_map = fs_map or get_storage_map(self._fs, self._path, cache,
                                       lock=lock_cache,
                                       storage_cache=storage_cache)
    self._fs_map = fs_map
    self._meta_information = meta_information
    self.username = None
    self.dataset_name = None
    if not needcreate:
        # Existing dataset: load metadata and validate it against arguments.
        self.meta = json.loads(fs_map["meta.json"].decode("utf-8"))
        self._name = self.meta.get("name") or None
        self._shape = tuple(self.meta["shape"])
        self._schema = hub.schema.deserialize.deserialize(
            self.meta["schema"])
        self._meta_information = self.meta.get("meta_info") or dict()
        self._flat_tensors = tuple(flatten(self._schema))
        self._tensors = dict(self._open_storage_tensors())
        if shape != (None, ) and shape != self._shape:
            raise TypeError(
                f"Shape in metafile [{self._shape}] and shape in arguments [{shape}] are !=, use mode='w' to overwrite dataset"
            )
        if schema is not None and sorted(schema.dict_.keys()) != sorted(
                self._schema.dict_.keys()):
            raise TypeError(
                "Schema in metafile and schema in arguments do not match, use mode='w' to overwrite dataset"
            )
    else:
        # New dataset: shape and schema are mandatory.
        if shape[0] is None:
            raise ShapeArgumentNotFoundException()
        if schema is None:
            raise SchemaArgumentNotFoundException()
        try:
            if shape is None:
                raise ShapeArgumentNotFoundException()
            if schema is None:
                raise SchemaArgumentNotFoundException()
            self._schema = schema
            self._shape = tuple(shape)
            self.meta = self._store_meta()
            self._meta_information = meta_information
            self._flat_tensors = tuple(flatten(self.schema))
            self._tensors = dict(self._generate_storage_tensors())
            self.flush()
        except Exception as e:
            # Creation failed: best-effort close, then remove the partial
            # dataset so a broken directory is not left behind.
            try:
                self.close()
            except Exception:
                pass
            self._fs.rm(self._path, recursive=True)
            logger.error("Deleting the dataset " + traceback.format_exc() +
                         str(e))
            raise
    self.indexes = list(range(self._shape[0]))
    # Register newly created datasets that live in hub-managed storage.
    if needcreate and (self._path.startswith("s3://snark-hub-dev/")
                       or self._path.startswith("s3://snark-hub/")):
        subpath = self._path[5:]
        spl = subpath.split("/")
        if len(spl) < 4:
            raise ValueError("Invalid Path for dataset")
        self.username = spl[-2]
        self.dataset_name = spl[-1]
        HubControlClient().create_dataset_entry(self.username,
                                                self.dataset_name,
                                                self.meta,
                                                public=public)
def get_user_name():
    """Return the current user's id from the stored Hub credentials."""
    return HubControlClient().get_config()["_id"]
def __init__(
    self,
    url: str,
    mode: str = "a",
    safe_mode: bool = False,
    shape=None,
    schema=None,
    token=None,
    fs=None,
    fs_map=None,
    cache: int = 2**26,
    storage_cache: int = 2**28,
    lock_cache=True,
    tokenizer=None,
):
    """| Open a new or existing dataset for read/write

    Parameters
    ----------
    url: str
        The url where dataset is located/should be created
    mode: str, optional (default to "w")
        Python way to tell whether dataset is for read or write (ex. "r", "w", "a")
    safe_mode: bool, optional
        if dataset exists it cannot be rewritten in safe mode,
        otherwise it lets to write the first time
    shape: tuple, optional
        Tuple with (num_samples,) format, where num_samples is number of samples
    schema: optional
        Describes the data of a single sample. Hub schemas are used for that
        Required for 'a' and 'w' modes
    token: str or dict, optional
        If url is refering to a place where authorization is required,
        token is the parameter to pass the credentials, it can be filepath or dict
    fs: optional
    fs_map: optional
    cache: int, optional
        Size of the memory cache. Default is 64MB (2**26)
        if 0, False or None, then cache is not used
    storage_cache: int, optional
        Size of the storage cache. Default is 256MB (2**28)
        if 0, False or None, then storage cache is not used
    lock_cache: bool, optional
        Lock the cache for avoiding multiprocessing errors
    """
    # Normalize shape to a 1-tuple; only (num_samples,) datasets are supported.
    shape = shape or (None, )
    if isinstance(shape, int):
        shape = [shape]
    if shape is not None:
        if len(tuple(shape)) != 1:
            raise ShapeLengthException
    if mode is None:
        raise NoneValueException("mode")
    # Storage cache is only meaningful when the memory cache is enabled.
    if not cache:
        storage_cache = False
    self.url = url
    self.token = token
    self.mode = mode
    self.tokenizer = tokenizer
    # Resolve the storage backend unless an fs was passed in explicitly.
    self._fs, self._path = ((fs, url) if fs else get_fs_and_path(
        self.url, token=token))
    self.cache = cache
    self._storage_cache = storage_cache
    self.lock_cache = lock_cache
    # NOTE(review): attribute name "verison" is a typo, preserved because
    # other code may read it — confirm before renaming.
    self.verison = "1.x"
    # True when the dataset directory must be created from scratch.
    needcreate = self._check_and_prepare_dir()
    fs_map = fs_map or get_storage_map(self._fs, self._path, cache,
                                       lock=lock_cache,
                                       storage_cache=storage_cache)
    self._fs_map = fs_map
    # In safe mode an existing dataset is opened read-only.
    if safe_mode and not needcreate:
        mode = "r"
    self.username = None
    self.dataset_name = None
    if not needcreate:
        # Existing dataset: load its metadata and tensors.
        self.meta = json.loads(fs_map["meta.json"].decode("utf-8"))
        self.shape = tuple(self.meta["shape"])
        self.schema = hub.schema.deserialize.deserialize(
            self.meta["schema"])
        self._flat_tensors = tuple(flatten(self.schema))
        self._tensors = dict(self._open_storage_tensors())
    else:
        # New dataset: shape and schema are mandatory.
        if shape[0] is None:
            raise ShapeArgumentNotFoundException()
        if schema is None:
            raise SchemaArgumentNotFoundException()
        try:
            if shape is None:
                raise ShapeArgumentNotFoundException()
            if schema is None:
                raise SchemaArgumentNotFoundException()
            self.schema: HubSchema = featurify(schema)
            self.shape = tuple(shape)
            self.meta = self._store_meta()
            self._flat_tensors = tuple(flatten(self.schema))
            self._tensors = dict(self._generate_storage_tensors())
            self.flush()
        except Exception as e:
            # Creation failed: best-effort close, then remove the partial
            # dataset so a broken directory is not left behind.
            try:
                self.close()
            except Exception:
                pass
            self._fs.rm(self._path, recursive=True)
            logger.error("Deleting the dataset " + traceback.format_exc() +
                         str(e))
            raise
    # Register newly created datasets that live in hub-managed storage.
    if needcreate and (self._path.startswith("s3://snark-hub-dev/")
                       or self._path.startswith("s3://snark-hub/")):
        subpath = self._path[5:]
        spl = subpath.split("/")
        if len(spl) < 4:
            raise ValueError("Invalid Path for dataset")
        self.username = spl[-2]
        self.dataset_name = spl[-1]
        HubControlClient().create_dataset_entry(self.username,
                                                self.dataset_name, self.meta)