Example #1
    def commit(self, message: str = "") -> str:
        """| Saves the current state of the dataset and returns the commit id.
        Automatically checks out to a new auto branch if the current commit is not the head of its branch.

        If the dataset was created before Hub v1.3.0, this only saves the dataset without any version control information.

        Parameters
        ----------
        message: str, optional
            The commit message to store along with the commit
        """
        if self._commit_id is None:
            warnings.warn(
                "This dataset was created before version control, it does not support it. commit will behave same as flush"
            )
            self.flush()
        elif "r" in self._mode:
            raise ReadModeException("commit")
        else:
            self._auto_checkout()
            stored_commit_id = self._commit_id
            self._commit_id = generate_hash()
            new_node = VersionNode(self._commit_id, self._branch)
            self._version_node.insert(new_node, message)
            self._version_node = new_node
            self._branch_node_map[self._branch] = new_node
            self._commit_node_map[self._commit_id] = new_node
            self.flush()
            return stored_commit_id
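A minimal usage sketch of commit on a Hub 1.x dataset. The local path "./data/demo" and the toy schema are illustrative assumptions, not taken from the example above:

# Illustrative sketch (Hub 1.x); the path and the schema are assumptions.
import numpy as np
from hub import Dataset
from hub.schema import Tensor

ds = Dataset(
    "./data/demo",
    mode="w",
    shape=(4,),
    schema={"image": Tensor((2, 2), dtype="uint8")},
)
ds["image", 0] = np.ones((2, 2), dtype="uint8")
commit_id = ds.commit("add first sample")  # returns the id of the commit just saved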
Example #2
    def checkout(self, address: str, create: bool = False) -> str:
        """| Changes the state of the dataset to the address mentioned. Creates a new branch if address isn't a commit id or branch name and create is True.
        Always checks out to the head of a branch if the address specified is a branch name.

        Returns the commit id of the commit that has been switched to.

        Only works if dataset was created on or after Hub v1.3.0

        Parameters
        ----------
        address: str
            The branch name or commit id to check out to
        create: bool, optional
            If True, creates a new branch from the current commit when the address isn't an existing branch name or commit id
        """
        if self._commit_id is None:
            raise VersioningNotSupportedException("checkout")
        self.flush()
        if address in self._branch_node_map.keys():
            self._branch = address
            self._version_node = self._branch_node_map[address]
            self._commit_id = self._version_node.commit_id
        elif address in self._commit_node_map.keys():
            self._version_node = self._commit_node_map[address]
            self._branch = self._version_node.branch
            self._commit_id = self._version_node.commit_id
        elif create:
            if "r" in self._mode:
                raise ReadModeException("checkout to create new branch")
            self._branch = address
            new_commit_id = generate_hash()
            new_node = VersionNode(new_commit_id, self._branch)
            if not self._version_node.children:
                for key in self.keys:
                    self._tensors[key].fs_map.copy_all_chunks(
                        self._commit_id, new_commit_id
                    )
                if self._version_node.parent is not None:
                    self._version_node.parent.insert(
                        new_node, f"switched to new branch {address}"
                    )
            else:
                self._version_node.insert(new_node, f"switched to new branch {address}")
            self._version_node = new_node
            self._commit_id = new_commit_id
            self._branch_node_map[self._branch] = new_node
            self._commit_node_map[self._commit_id] = new_node
            self.flush()
        else:
            raise AddressNotFound(address)
        return self._commit_id
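Continuing the sketch above, a hedged example of creating and switching branches with checkout; the branch name "alternate" is an assumption:

# "alternate" is an illustrative branch name; ds and commit_id come from the sketch above.
new_commit = ds.checkout("alternate", create=True)  # branch off the current commit
ds["image", 1] = np.zeros((2, 2), dtype="uint8")
ds.commit("modify a sample on the alternate branch")
ds.checkout("master")     # back to the head of master
ds.checkout(commit_id)    # or jump directly to a stored commit id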
Example #3
    def __init__(
        self,
        url: str,
        mode: str = None,
        shape=None,
        schema=None,
        token=None,
        fs=None,
        fs_map=None,
        meta_information=dict(),
        cache: int = defaults.DEFAULT_MEMORY_CACHE_SIZE,
        storage_cache: int = defaults.DEFAULT_STORAGE_CACHE_SIZE,
        lock_cache=True,
        tokenizer=None,
        lazy: bool = True,
        public: bool = True,
        name: str = None,
    ):
        """| Open a new or existing dataset for read/write

        Parameters
        ----------
        url: str
            The url where the dataset is located or should be created
        mode: str, optional (defaults to "a")
            Python way to tell whether the dataset is for read or write (e.g. "r", "w", "a")
        shape: tuple, optional
            Tuple in (num_samples,) format, where num_samples is the number of samples
        schema: optional
            Describes the data of a single sample using Hub schemas.
            Required for 'a' and 'w' modes
        token: str or dict, optional
            If url refers to a place where authorization is required,
            token is the parameter to pass the credentials; it can be a filepath or dict
        fs: optional
        fs_map: optional
        meta_information: dict, optional
            Dictionary with meta information about the dataset
        cache: int, optional
            Size of the memory cache in bytes. Defaults to 64MB (2**26).
            If 0, False or None, the cache is not used
        storage_cache: int, optional
            Size of the storage cache in bytes. Defaults to 256MB (2**28).
            If 0, False or None, the storage cache is not used
        lock_cache: bool, optional
            Lock the cache to avoid multiprocessing errors
        lazy: bool, optional
            Setting this to False disables lazy computation and allows items to be accessed without calling .compute()
        public: bool, optional
            Only applicable when using hub storage, ignored otherwise.
            Setting this to False allows only the user who created the dataset to access it,
            and the dataset won't be visible to the public in the visualizer
        name: str, optional
            Only applicable when using hub storage; this is the name that shows up in the visualizer
        """

        shape = norm_shape(shape)
        if len(shape) != 1:
            raise ShapeLengthException()

        storage_cache = norm_cache(storage_cache) if cache else 0
        cache = norm_cache(cache)

        schema: SchemaDict = featurify(schema) if schema else None

        self._url = url
        self._token = token
        self.tokenizer = tokenizer
        self.lazy = lazy
        self._name = name

        self._fs, self._path = ((fs, url) if fs else get_fs_and_path(
            self._url, token=token, public=public))
        self._cache = cache
        self._storage_cache = storage_cache
        self.lock_cache = lock_cache
        self.version = "1.x"
        mode = self._get_mode(mode, self._fs)
        self._mode = mode
        needcreate = self._check_and_prepare_dir()
        fs_map = fs_map or get_storage_map(self._fs,
                                           self._path,
                                           cache,
                                           lock=lock_cache,
                                           storage_cache=storage_cache)
        self._fs_map = fs_map
        self._meta_information = meta_information
        self.username = None
        self.dataset_name = None
        if not needcreate:
            self.meta = json.loads(fs_map[defaults.META_FILE].decode("utf-8"))
            self._name = self.meta.get("name") or None
            self._shape = tuple(self.meta["shape"])
            self._schema = hub.schema.deserialize.deserialize(
                self.meta["schema"])
            self._meta_information = self.meta.get("meta_info") or dict()
            self._flat_tensors = tuple(flatten(self._schema))
            try:
                version_info = pickle.loads(fs_map[defaults.VERSION_INFO])
                self._branch_node_map = version_info.get("branch_node_map")
                self._commit_node_map = version_info.get("commit_node_map")
                self._chunk_commit_map = version_info.get("chunk_commit_map")
                if not (self._branch_node_map and self._commit_node_map
                        and self._chunk_commit_map):
                    raise InvalidVersionInfoException()
                self._branch = "master"
                self._version_node = self._branch_node_map[self._branch]
                self._commit_id = self._version_node.commit_id
            except (KeyError, InvalidVersionInfoException):
                self._commit_id = None
                self._branch = None
                self._version_node = None
                self._branch_node_map = None
                self._commit_node_map = None
                self._chunk_commit_map = None

            self._tensors = dict(self._open_storage_tensors())

            if shape != (None,) and shape != self._shape:
                raise TypeError(
                    f"Shape in metafile [{self._shape}] and shape in arguments [{shape}] do not match, use mode='w' to overwrite dataset"
                )
            if schema is not None and sorted(schema.dict_.keys()) != sorted(
                    self._schema.dict_.keys()):
                raise TypeError(
                    "Schema in metafile and schema in arguments do not match, use mode='w' to overwrite dataset"
                )

        else:
            if shape[0] is None:
                raise ShapeArgumentNotFoundException()
            if schema is None:
                raise SchemaArgumentNotFoundException()
            try:
                self._schema = schema
                self._shape = tuple(shape)
                self.meta = self._store_meta()
                self._meta_information = meta_information
                self._flat_tensors = tuple(flatten(self.schema))

                self._commit_id = generate_hash()
                self._branch = "master"
                self._version_node = VersionNode(self._commit_id, self._branch)
                self._branch_node_map = {self._branch: self._version_node}
                self._commit_node_map = {self._commit_id: self._version_node}
                self._chunk_commit_map = {
                    path: defaultdict(set)
                    for schema, path in self._flat_tensors
                }
                self._tensors = dict(self._generate_storage_tensors())
            except Exception as e:
                try:
                    self.close()
                except Exception:
                    pass
                self._fs.rm(self._path, recursive=True)
                logger.error("Deleting the dataset " + traceback.format_exc() +
                             str(e))
                raise
        self.flush()
        self.indexes = list(range(self._shape[0]))

        if self._path.startswith("s3://snark-hub-dev/") or self._path.startswith(
                "s3://snark-hub/"):
            subpath = self._path[5:]
            spl = subpath.split("/")
            if len(spl) < 4:
                raise ValueError("Invalid Path for dataset")
            self.username = spl[-2]
            self.dataset_name = spl[-1]
            if needcreate:
                HubControlClient().create_dataset_entry(self.username,
                                                        self.dataset_name,
                                                        self.meta,
                                                        public=public)
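To tie the constructor parameters together, a sketch of the two code paths above (creating vs. opening a dataset). The path and schema are illustrative assumptions:

# Sketch of the two constructor paths (Hub 1.x); path and schema are assumptions.
from hub import Dataset
from hub.schema import ClassLabel, Image

# Create a new dataset: mode "w" requires both shape and schema.
ds = Dataset(
    "./data/animals",
    mode="w",
    shape=(100,),
    schema={"image": Image((64, 64, 3)), "label": ClassLabel(num_classes=2)},
)
ds.flush()

# Open the existing dataset: shape and schema are read back from the stored
# meta file, and passing them again is optional (they must match if given).
ds = Dataset("./data/animals", mode="r")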