Code Example #1
def _connect(tag):
    """Connects to the backend and receives credentials"""

    creds = HubControlClient().get_config()
    dataset = HubControlClient().get_dataset_path(tag)

    # If dataset is in DB then return the path
    # Otherwise construct the path from the tag
    if dataset and "path" in dataset:
        path = dataset["path"]
    else:
        sub_tags = tag.split("/")
        # Get repository path from the cred location
        path = "/".join(creds["bucket"].split("/")[:-1])
        path = f"{path}/{sub_tags[0]}/{sub_tags[-1]}"
    return path, creds
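
The fallback branch rebuilds the storage path from the tag alone. A minimal worked sketch of that logic, using an illustrative creds dict in place of the real get_config() response:

creds = {"bucket": "s3://snark-hub/shared/someuser"}  # illustrative value
tag = "otheruser/mnist"

sub_tags = tag.split("/")
# Drop the trailing username component of the bucket...
path = "/".join(creds["bucket"].split("/")[:-1])      # s3://snark-hub/shared
# ...then append the tag owner and the dataset name.
path = f"{path}/{sub_tags[0]}/{sub_tags[-1]}"
print(path)  # s3://snark-hub/shared/otheruser/mnist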
Code Example #2
File: auth.py Project: srikanthreddybethi/Hub
def login_fn(username, password, token=False):
    """ Logs in to Snark AI"""
    if token:
        # Token-based login: prompt the user to paste a pre-issued token.
        logger.info("Token login.")
        logger.debug("Getting the token...")
        token = click.prompt(
            "Please paste the authentication token from {}".format(
                config.GET_TOKEN_REST_SUFFIX),
            type=str,
            hide_input=True,
        )
        token = token.strip()
        AuthClient().check_token(token)
    else:
        logger.info(
            "Please log in using Activeloop credentials. You can register at https://app.activeloop.ai "
        )
        if not username:
            logger.debug("Prompting for username.")
            username = click.prompt("Username", type=str)
        username = username.strip()
        if not password:
            logger.debug("Prompting for password.")
            password = click.prompt("Password", type=str, hide_input=True)
        password = password.strip()
        token = AuthClient().get_access_token(username, password)
    TokenManager.set_token(token)
    HubControlClient().get_credentials()
    logger.info("Login Successful.")
Code Example #3
File: store.py Project: xBugs-dot/Hub
def get_fs_and_path(url: str,
                    token=None,
                    public=True) -> Tuple[fsspec.AbstractFileSystem, str]:
    if url.startswith("s3://"):
        token = token or dict()
        token = read_aws_creds(token) if isinstance(token, str) else token
        return (
            S3FileSystemReplacement(
                key=token.get("aws_access_key_id"),
                secret=token.get("aws_secret_access_key"),
                token=token.get("aws_session_token"),
                client_kwargs={
                    "endpoint_url": token.get("endpoint_url"),
                    "region_name": token.get("region"),
                },
            ),
            url[5:],
        )
    elif url.startswith("gcs://"):
        return gcsfs.GCSFileSystem(token=token), url[6:]
    elif url.find("blob.core.windows.net/") != -1:
        account_name = url.split(".")[0]
        account_name = account_name[8:] if url.startswith(
            "https://") else account_name
        return (
            AzureBlobFileSystem(
                account_name=account_name,
                account_key=token.get("account_key"),
            ),
            url[url.find("blob.core.windows.net/") + 22:],
        )
    elif (url.startswith("../") or url.startswith("./") or url.startswith("/")
          or url.startswith("~/")):
        return fsspec.filesystem("file"), url
    elif (
            # windows local file system
            re.search("^[A-Za-z]:", url)):
        return fsspec.filesystem("file"), url
    else:
        # TODO check if url is username/dataset:version
        if url.split("/")[0] == "google":
            org_id, ds_name = url.split("/")
            token, url = HubControlClient().get_dataset_credentials(
                org_id, ds_name)
            fs = gcsfs.GCSFileSystem(token=token)
            url = url[6:]
        else:
            url, creds = _connect(url, public=public)
            fs = S3FileSystemReplacement(
                expiration=creds["expiration"],
                key=creds["access_key"],
                secret=creds["secret_key"],
                token=creds["session_token"],
                client_kwargs={
                    "endpoint_url": creds["endpoint"],
                    "region_name": creds["region"],
                },
            )
        return (fs, url)
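
The function is a scheme dispatcher: each branch pairs a filesystem implementation with the URL stripped of its scheme prefix. A hedged usage sketch (bucket, tag, and file names are illustrative):

# Local path: plain fsspec "file" filesystem, path returned unchanged.
fs, path = get_fs_and_path("./data/mnist")

# S3 URL: the "s3://" prefix is stripped; a string token is read as an AWS creds file.
fs, path = get_fs_and_path("s3://my-bucket/datasets/mnist", token="~/.aws/credentials")

# Bare "user/dataset" tag: falls through to _connect() and Hub-managed storage.
fs, path = get_fs_and_path("someuser/mnist")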
Code Example #4
File: core.py Project: x213212/Hub
def _connect(tag):
    """Connects to the backend and receive credentials"""

    creds = HubControlClient().get_config()
    dataset = HubControlClient().get_dataset_path(tag)

    if dataset and "path" in dataset:
        path = dataset["path"]
    else:
        sub_tags = tag.split("/")
        real_tag = sub_tags[-1]
        if len(sub_tags) > 1 and sub_tags[0] != creds["_id"]:
            username = creds["bucket"].split("/")[-1]
            creds["bucket"] = creds["bucket"].replace(username, sub_tags[0])

        path = f"{creds['bucket']}/{real_tag}"
    return path, creds
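
Unlike the variant in Code Example #1, this version keeps the caller's bucket and only swaps its trailing username when the tag belongs to another user. A worked sketch with illustrative values:

creds = {"_id": "someuser", "bucket": "s3://snark-hub/shared/someuser"}  # illustrative
tag = "otheruser/mnist"

sub_tags = tag.split("/")                            # ["otheruser", "mnist"]
real_tag = sub_tags[-1]                              # "mnist"
# The tag owner differs from the authenticated user, so rewrite the bucket.
username = creds["bucket"].split("/")[-1]            # "someuser"
bucket = creds["bucket"].replace(username, sub_tags[0])
print(f"{bucket}/{real_tag}")                        # s3://snark-hub/shared/otheruser/mnist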
Code Example #5
    def delete(self):
        fs, path = self._fs, self._path
        exist_meta = fs.exists(posixpath.join(path, "meta.json"))
        if exist_meta:
            fs.rm(path, recursive=True)
            if self.username is not None:
                HubControlClient().delete_dataset_entry(
                    self.username, self.dataset_name)
            return True
        return False
Code Example #6
File: dataset.py Project: tqhdesilva/Hub
    def delete(self):
        """ Deletes the dataset """
        fs, path = self._fs, self._path
        exist_meta = fs.exists(posixpath.join(path, defaults.META_FILE))
        if exist_meta:
            fs.rm(path, recursive=True)
            if self.username is not None:
                HubControlClient().delete_dataset_entry(
                    self.username, self.dataset_name)
            return True
        return False
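
Deletion is gated on the presence of the meta file, so a path that never held a dataset is left untouched. A hedged usage sketch, where ds stands in for an open dataset object:

if ds.delete():
    print("Dataset removed, along with its backend entry if one existed.")
else:
    print("No meta file found; nothing was deleted.")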
Code Example #7
File: dataset.py Project: gavinljj/Hub-1
    def copy(self, dst_url: str, token=None, fs=None, public=True):
        """| Creates a copy of the dataset at the specified url and returns the dataset object
        Parameters
        ----------
        dst_url: str
            The destination url where the dataset should be copied
        token: str or dict, optional
            If dst_url refers to a location that requires authorization,
            token is used to pass the credentials; it can be a filepath or a dict
        fs: optional
        public: bool, optional
            only applicable if using hub storage, ignored otherwise
            setting this to False restricts access to the new copied dataset to the user who created it,
            and the copy won't be visible to the public in the visualizer
        """
        self.flush()
        destination = dst_url
        path = _copy_helper(
            dst_url=dst_url,
            token=token,
            fs=fs,
            public=public,
            src_url=self._path,
            src_fs=self._fs,
        )

        #  create entry in database if stored in hub storage
        if path.startswith("s3://snark-hub-dev/") or path.startswith(
                "s3://snark-hub/"):
            subpath = path[5:]
            spl = subpath.split("/")
            if len(spl) < 4:
                raise ValueError("Invalid Path for dataset")
            username = spl[-2]
            dataset_name = spl[-1]
            HubControlClient().create_dataset_entry(username,
                                                    dataset_name,
                                                    self.meta,
                                                    public=public)
        return hub.Dataset(destination, token=token, fs=fs, public=public)
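
The entry-creation branch parses the copied path as ".../<username>/<dataset_name>", which is the Hub storage layout. A hedged usage sketch with an illustrative destination:

# Copy into Hub-managed storage; a database entry is created for the new path.
new_ds = ds.copy("s3://snark-hub/shared/someuser/mnist-copy", public=False)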
Code Example #8
File: s3_storage.py Project: xBugs-dot/Hub
    def check_update_creds(self):
        # Re-fetch credentials and rebuild the boto3 client/resource once expired
        if self.expiration and float(self.expiration) < time.time():
            details = HubControlClient().get_credentials()
            self.expiration = details["expiration"]
            self.client = boto3.client(
                "s3",
                aws_access_key_id=details["access_key"],
                aws_secret_access_key=details["secret_key"],
                aws_session_token=details["session_token"],
                config=self.client_config,
                endpoint_url=self.endpoint_url,
                region_name=self.aws_region,
            )
            self.resource = boto3.resource(
                "s3",
                aws_access_key_id=details["access_key"],
                aws_secret_access_key=details["secret_key"],
                aws_session_token=details["session_token"],
                config=self.client_config,
                endpoint_url=self.endpoint_url,
                region_name=self.aws_region,
            )
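
The guard rebuilds the boto3 client and resource only after the cached credentials have actually expired. A minimal sketch of the check itself, with an illustrative expiration value:

import time

expiration = "1700000000"  # epoch seconds, as returned by the backend
if expiration and float(expiration) < time.time():
    print("Credentials expired; refresh via HubControlClient().get_credentials()")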
Code Example #9
    def _update_dataset_state(self):
        if self.username is not None:
            HubControlClient().update_dataset_state(self.username,
                                                    self.dataset_name,
                                                    "UPLOADED")
Code Example #10
    def __init__(
        self,
        url: str,
        mode: str = None,
        shape=None,
        schema=None,
        token=None,
        fs=None,
        fs_map=None,
        meta_information=dict(),
        cache: int = defaults.DEFAULT_MEMORY_CACHE_SIZE,
        storage_cache: int = defaults.DEFAULT_STORAGE_CACHE_SIZE,
        lock_cache=True,
        tokenizer=None,
        lazy: bool = True,
        public: bool = True,
        name: str = None,
    ):
        """| Open a new or existing dataset for read/write

        Parameters
        ----------
        url: str
            The url where the dataset is located/should be created
        mode: str, optional (defaults to "a")
            Python way to tell whether dataset is for read or write (ex. "r", "w", "a")
        shape: tuple, optional
            Tuple with (num_samples,) format, where num_samples is number of samples
        schema: optional
            Describes the data of a single sample. Hub schemas are used for that
            Required for 'a' and 'w' modes
        token: str or dict, optional
            If url refers to a location that requires authorization,
            token is used to pass the credentials; it can be a filepath or a dict
        fs: optional
        fs_map: optional
        meta_information: dict, optional
            Dictionary providing additional information about the dataset
        cache: int, optional
            Size of the memory cache. Default is 64MB (2**26)
            if 0, False or None, then cache is not used
        storage_cache: int, optional
            Size of the storage cache. Default is 256MB (2**28)
            if 0, False or None, then storage cache is not used
        lock_cache: bool, optional
            Lock the cache for avoiding multiprocessing errors
        lazy: bool, optional
            Setting this to False will stop lazy computation and will allow items to be accessed without .compute()
        public: bool, optional
            only applicable if using hub storage, ignored otherwise
            setting this to False restricts access to the dataset to the user who created it,
            and the dataset won't be visible to the public in the visualizer
        name: str, optional
            only applicable when using hub storage, this is the name that shows up on the visualizer
        """

        shape = norm_shape(shape)
        if len(shape) != 1:
            raise ShapeLengthException()

        storage_cache = norm_cache(storage_cache) if cache else 0
        cache = norm_cache(cache)
        schema: SchemaDict = featurify(schema) if schema else None

        self._url = url
        self._token = token
        self.tokenizer = tokenizer
        self.lazy = lazy
        self._name = name

        self._fs, self._path = ((fs, url) if fs else get_fs_and_path(
            self._url, token=token, public=public))
        self._cache = cache
        self._storage_cache = storage_cache
        self.lock_cache = lock_cache
        self.version = "1.x"
        mode = self._get_mode(mode, self._fs)
        self._mode = mode
        needcreate = self._check_and_prepare_dir()
        fs_map = fs_map or get_storage_map(self._fs,
                                           self._path,
                                           cache,
                                           lock=lock_cache,
                                           storage_cache=storage_cache)
        self._fs_map = fs_map
        self._meta_information = meta_information
        self.username = None
        self.dataset_name = None
        if not needcreate:
            self.meta = json.loads(fs_map["meta.json"].decode("utf-8"))
            self._name = self.meta.get("name") or None
            self._shape = tuple(self.meta["shape"])
            self._schema = hub.schema.deserialize.deserialize(
                self.meta["schema"])
            self._meta_information = self.meta.get("meta_info") or dict()
            self._flat_tensors = tuple(flatten(self._schema))
            self._tensors = dict(self._open_storage_tensors())
            if shape != (None, ) and shape != self._shape:
                raise TypeError(
                    f"Shape in metafile [{self._shape}] and shape in arguments [{shape}] do not match, use mode='w' to overwrite dataset"
                )
            if schema is not None and sorted(schema.dict_.keys()) != sorted(
                    self._schema.dict_.keys()):
                raise TypeError(
                    "Schema in metafile and schema in arguments do not match, use mode='w' to overwrite dataset"
                )

        else:
            if shape[0] is None:
                raise ShapeArgumentNotFoundException()
            if schema is None:
                raise SchemaArgumentNotFoundException()
            try:
                if shape is None:
                    raise ShapeArgumentNotFoundException()
                if schema is None:
                    raise SchemaArgumentNotFoundException()
                self._schema = schema
                self._shape = tuple(shape)
                self.meta = self._store_meta()
                self._meta_information = meta_information
                self._flat_tensors = tuple(flatten(self.schema))
                self._tensors = dict(self._generate_storage_tensors())
                self.flush()
            except Exception as e:
                try:
                    self.close()
                except Exception:
                    pass
                self._fs.rm(self._path, recursive=True)
                logger.error("Deleting the dataset " + traceback.format_exc() +
                             str(e))
                raise

        self.indexes = list(range(self._shape[0]))

        if needcreate and (self._path.startswith("s3://snark-hub-dev/")
                           or self._path.startswith("s3://snark-hub/")):
            subpath = self._path[5:]
            spl = subpath.split("/")
            if len(spl) < 4:
                raise ValueError("Invalid Path for dataset")
            self.username = spl[-2]
            self.dataset_name = spl[-1]
            HubControlClient().create_dataset_entry(self.username,
                                                    self.dataset_name,
                                                    self.meta,
                                                    public=public)
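
A hedged usage sketch of the two paths through this constructor, assuming the public hub.Dataset API of Hub 1.x and illustrative schema/URL values:

import hub
from hub.schema import ClassLabel, Image

# Create: "w" mode requires both shape and schema.
ds = hub.Dataset(
    "someuser/mnist",
    mode="w",
    shape=(10000,),
    schema={"image": Image(shape=(28, 28, 1)), "label": ClassLabel(num_classes=10)},
)
ds.flush()

# Reopen: shape and schema are read back from the stored meta.
ds = hub.Dataset("someuser/mnist", mode="r")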
Code Example #11
File: store.py Project: xBugs-dot/Hub
def get_user_name():
    creds = HubControlClient().get_config()
    return creds["_id"]
Code Example #12
    def __init__(
        self,
        url: str,
        mode: str = "a",
        safe_mode: bool = False,
        shape=None,
        schema=None,
        token=None,
        fs=None,
        fs_map=None,
        cache: int = 2**26,
        storage_cache: int = 2**28,
        lock_cache=True,
        tokenizer=None,
    ):
        """| Open a new or existing dataset for read/write

        Parameters
        ----------
        url: str
            The url where the dataset is located/should be created
        mode: str, optional (defaults to "a")
            Python way to tell whether dataset is for read or write (ex. "r", "w", "a")
        safe_mode: bool, optional
            if the dataset already exists, safe mode reopens it read-only instead of overwriting it;
            creating a new dataset is unaffected
        shape: tuple, optional
            Tuple with (num_samples,) format, where num_samples is number of samples
        schema: optional
            Describes the data of a single sample. Hub schemas are used for that
            Required for 'a' and 'w' modes
        token: str or dict, optional
            If url refers to a location that requires authorization,
            token is used to pass the credentials; it can be a filepath or a dict
        fs: optional
        fs_map: optional
        cache: int, optional
            Size of the memory cache. Default is 64MB (2**26)
            if 0, False or None, then cache is not used
        storage_cache: int, optional
            Size of the storage cache. Default is 256MB (2**28)
            if 0, False or None, then storage cache is not used
        lock_cache: bool, optional
            Lock the cache for avoiding multiprocessing errors
        """

        shape = shape or (None, )
        if isinstance(shape, int):
            shape = [shape]
        if shape is not None:
            if len(tuple(shape)) != 1:
                raise ShapeLengthException
        if mode is None:
            raise NoneValueException("mode")

        if not cache:
            storage_cache = False

        self.url = url
        self.token = token
        self.mode = mode
        self.tokenizer = tokenizer

        self._fs, self._path = ((fs, url) if fs else get_fs_and_path(
            self.url, token=token))
        self.cache = cache
        self._storage_cache = storage_cache
        self.lock_cache = lock_cache
        self.version = "1.x"

        needcreate = self._check_and_prepare_dir()
        fs_map = fs_map or get_storage_map(self._fs,
                                           self._path,
                                           cache,
                                           lock=lock_cache,
                                           storage_cache=storage_cache)
        self._fs_map = fs_map

        if safe_mode and not needcreate:
            mode = "r"
        self.username = None
        self.dataset_name = None
        if not needcreate:
            self.meta = json.loads(fs_map["meta.json"].decode("utf-8"))
            self.shape = tuple(self.meta["shape"])
            self.schema = hub.schema.deserialize.deserialize(
                self.meta["schema"])
            self._flat_tensors = tuple(flatten(self.schema))
            self._tensors = dict(self._open_storage_tensors())
        else:
            if shape[0] is None:
                raise ShapeArgumentNotFoundException()
            if schema is None:
                raise SchemaArgumentNotFoundException()
            try:
                if shape is None:
                    raise ShapeArgumentNotFoundException()
                if schema is None:
                    raise SchemaArgumentNotFoundException()
                self.schema: HubSchema = featurify(schema)
                self.shape = tuple(shape)
                self.meta = self._store_meta()
                self._flat_tensors = tuple(flatten(self.schema))
                self._tensors = dict(self._generate_storage_tensors())
                self.flush()
            except Exception as e:
                try:
                    self.close()
                except Exception:
                    pass
                self._fs.rm(self._path, recursive=True)
                logger.error("Deleting the dataset " + traceback.format_exc() +
                             str(e))
                raise

        if needcreate and (self._path.startswith("s3://snark-hub-dev/")
                           or self._path.startswith("s3://snark-hub/")):
            subpath = self._path[5:]
            spl = subpath.split("/")
            if len(spl) < 4:
                raise ValueError("Invalid Path for dataset")
            self.username = spl[-2]
            self.dataset_name = spl[-1]
            HubControlClient().create_dataset_entry(self.username,
                                                    self.dataset_name,
                                                    self.meta)
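
Note how safe_mode interacts with needcreate above: an existing dataset is silently reopened read-only rather than overwritten, while creating a new dataset is unaffected. A minimal sketch of that guard:

safe_mode = True
needcreate = False  # the dataset already exists at the path
mode = "a"
if safe_mode and not needcreate:
    mode = "r"  # existing data is protected; writes are disabled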