Example #1
    def collect_granular(self,
                         target=None,
                         with_deps=False,
                         recursive=False,
                         graph=None):
        """
        In case of ambiguity, the priority order is as follows:
            - .dvc file or .yaml file
            - dir if recursive and directory exists
            - stage_name
            - output file
        """
        if not target:
            return [(stage, None) for stage in self.stages]

        file, name = parse_target(target)
        stages = []

        # Optimization: do not collect the graph for a specific target
        if not file:
            # parsing is ambiguous when it does not have a colon
            # or if it's not a dvcfile, as it can be a stage name
            # in `dvc.yaml` or an output in a stage.
            logger.debug("Checking if stage '%s' is in '%s'", target,
                         PIPELINE_FILE)
            if not (recursive and os.path.isdir(target)):
                stage = self._collect_from_default_dvcfile(target)
                if stage:
                    stages = (self._collect_pipeline(stage)
                              if with_deps else [stage])
        elif not with_deps and is_valid_filename(file):
            stages = self.get_stages(file, name)

        if not stages:
            if not (recursive and os.path.isdir(target)):
                try:
                    (out, ) = self.find_outs_by_path(target, strict=False)
                    filter_info = PathInfo(os.path.abspath(target))
                    return [(out.stage, filter_info)]
                except OutputNotFoundError:
                    pass

            try:
                stages = self.collect(target, with_deps, recursive, graph)
            except StageFileDoesNotExistError as exc:
                # collect() might try to use `target` as a stage name
                # and raise an error that dvc.yaml does not exist, whereas it
                # should say that both the stage name and the file do not exist.
                if file and is_valid_filename(file):
                    raise
                raise NoOutputOrStageError(target, exc.file) from exc
            except StageNotFound as exc:
                raise NoOutputOrStageError(target, exc.file) from exc

        return [(stage, None) for stage in stages]
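
The docstring above describes a fixed resolution order for ambiguous targets. The toy resolver below is a minimal, self-contained sketch of that order, not DVC's actual implementation; the stage-name and output lookups are stand-ins:

import os


def resolve_target(target, stage_names, outputs, recursive=False):
    """Toy illustration of collect_granular()'s priority order."""
    if target.endswith((".dvc", ".yaml")):       # 1. stage file
        return ("file", target)
    if recursive and os.path.isdir(target):     # 2. directory, when recursive
        return ("dir", target)
    if target in stage_names:                   # 3. stage name in dvc.yaml
        return ("stage", target)
    if target in outputs:                       # 4. dvc-tracked output path
        return ("output", target)
    raise ValueError(f"no stage or output found for {target!r}")


print(resolve_target("train", {"train"}, {"data/raw"}))     # ('stage', 'train')
print(resolve_target("data/raw", {"train"}, {"data/raw"}))  # ('output', 'data/raw')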
Example #2
def _collect_specific_target(
    loader: "StageLoad",
    target: str,
    with_deps: bool,
    recursive: bool,
    accept_group: bool,
) -> Tuple[StageIter, "OptStr", "OptStr"]:
    from dvc.dvcfile import is_valid_filename

    # Optimization: do not collect the graph for a specific target
    file, name = parse_target(target)

    # if the target has a file, we can load directly from it.
    if not file:
        # but, if there's no file, parsing is ambiguous as it can be a
        # stage name in the `dvc.yaml` file or an output. We prioritize
        # the `dvc.yaml` stage name here. If it exists, we move on;
        # else, we assume it's an output name in `collect_granular()` below.
        msg = "Checking if stage '%s' is in '%s'"
        logger.debug(msg, target, PIPELINE_FILE)
        if not (recursive and loader.tree.isdir(target)):
            stages = _maybe_collect_from_dvc_yaml(
                loader,
                target,
                with_deps,
                accept_group=accept_group,
            )
            if stages:
                return stages, file, name
    elif not with_deps and is_valid_filename(file):
        stages = loader.load_all(file, name, accept_group=accept_group)
        return stages, file, name
    return [], file, name
Example #3
def get(url, path, out=None, rev=None):
    from dvc.external_repo import external_repo
    from dvc.dvcfile import is_valid_filename

    out = resolve_output(path, out)

    if is_valid_filename(out):
        raise GetDVCFileError()

    # Creating a directory right beside the output to make sure that they
    # are on the same filesystem, so we can take advantage of
    # reflink and/or hardlink. Not using tempfile.TemporaryDirectory
    # because it will create a symlink to tmpfs, which defeats the purpose
    # and won't work with reflink/hardlink.
    dpath = os.path.dirname(os.path.abspath(out))
    tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid()))
    try:
        with external_repo(url=url, rev=rev) as repo:
            if hasattr(repo, "cache"):
                repo.cache.local.cache_dir = tmp_dir

                # Try any links possible to avoid data duplication.
                #
                # Not using symlink, because we need to remove cache after we
                # are done, and to make that work we would have to copy data
                # over anyway before removing the cache, so we might just copy
                # it right away.
                #
                # Also, we can't use the theoretical "move" link type here, because
                # the same cache file might be used a few times in a directory.
                repo.cache.local.cache_types = ["reflink", "hardlink", "copy"]

            repo.pull_to(path, PathInfo(out))
    finally:
        remove(tmp_dir)
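
The comment above explains why the temporary cache directory is created right beside the output rather than with tempfile.TemporaryDirectory. A minimal, standalone sketch of that pattern, using the standard library's uuid instead of shortuuid so it runs as-is:

import os
import shutil
import uuid


def sibling_tmp_dir(out_path):
    """Create a hidden temp dir beside out_path, i.e. on the same filesystem,
    so that reflinks/hardlinks between it and the output remain possible."""
    dpath = os.path.dirname(os.path.abspath(out_path))
    tmp_dir = os.path.join(dpath, "." + uuid.uuid4().hex)
    os.makedirs(tmp_dir)
    return tmp_dir


tmp = sibling_tmp_dir("model.pkl")
try:
    pass  # download data into tmp, then reflink/hardlink/copy it into place
finally:
    shutil.rmtree(tmp)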
Example #4
File: repo.py Project: skshetry/dvc
        def _func(fname):
            if dvcfiles:
                return True

            return not (
                is_valid_filename(fname) or fname == DvcIgnore.DVCIGNORE_FILE
            )
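
For context, every example on this page filters on is_valid_filename from dvc/dvcfile.py, whose definition is not shown here. The sketch below is an assumption of roughly what it checks; the constants are guesses based on how the function is called in these examples:

import os

# assumed constants; the real ones live in dvc.dvcfile
DVC_FILE_SUFFIX = ".dvc"
PIPELINE_FILE = "dvc.yaml"
PIPELINE_LOCK = "dvc.lock"


def is_valid_filename(path):
    """Return True if path looks like a DVC stage/pipeline file."""
    return path.endswith(DVC_FILE_SUFFIX) or os.path.basename(path) in (
        PIPELINE_FILE,
        PIPELINE_LOCK,
    )


print(is_valid_filename("train.dvc"))      # True
print(is_valid_filename("sub/dvc.yaml"))   # True
print(is_valid_filename("data/raw.csv"))   # False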
Example #5
def parse_target(target, default=None):
    from dvc.dvcfile import PIPELINE_FILE, PIPELINE_LOCK, is_valid_filename
    from dvc.exceptions import DvcException

    if not target:
        return None, None

    match = TARGET_REGEX.match(target)
    if not match:
        return target, None

    path, name = (
        match.group("path"),
        match.group("name"),
    )
    if path:
        if os.path.basename(path) == PIPELINE_LOCK:
            raise DvcException(
                "Did you mean: `{}`?".format(
                    target.replace(".lock", ".yaml", 1)
                )
            )
        if not name:
            ret = (target, None)
            return ret if is_valid_filename(target) else ret[::-1]

    if not path:
        path = default or PIPELINE_FILE
        logger.debug("Assuming file to be '%s'", path)

    return path, name
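
The function above splits a target of the form path:name. The toy version below is a self-contained illustration of that convention; its regex is an assumption, since DVC's actual TARGET_REGEX is not shown on this page:

import re

TOY_TARGET_REGEX = re.compile(r"(?P<path>.*?)(:(?P<name>[^\\/:]*))?$")


def toy_parse(target, default="dvc.yaml"):
    match = TOY_TARGET_REGEX.match(target)
    path, name = match.group("path"), match.group("name")
    return path or default, name


print(toy_parse("dvc.yaml:train"))  # ('dvc.yaml', 'train')
print(toy_parse("train.dvc"))       # ('train.dvc', None)
print(toy_parse(":train"))          # ('dvc.yaml', 'train')

# (the real parse_target additionally treats a bare word that is not a valid
# dvcfile name, e.g. "train", as a stage name in the default dvc.yaml)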
Example #6
    def collect_granular(
        self,
        target: str = None,
        with_deps: bool = False,
        recursive: bool = False,
        graph: "DiGraph" = None,
        accept_group: bool = False,
    ) -> List[StageInfo]:
        """Collects a list of (stage, filter_info) from the given target.

        In case of ambiguity, the priority order is as follows:
        - .dvc file or .yaml file
        - dir if recursive and directory exists
        - stage_name
        - output file

        Args:
            target: if not provided, all of the stages without any filters are
                returned.
                If `target` is a path to a dvc-tracked output,
                a (stage, output_path_info) is returned.
                Otherwise, the details above for `target` in `collect()`
                apply.

            (see `collect()` for other arguments)
        """
        if not target:
            return [StageInfo(stage) for stage in self.repo.stages]

        stages, file, _ = _collect_specific_target(self, target, with_deps,
                                                   recursive, accept_group)
        if not stages:
            if not (recursive and self.tree.isdir(target)):
                try:
                    (out, ) = self.repo.find_outs_by_path(target, strict=False)
                    filter_info = PathInfo(os.path.abspath(target))
                    return [StageInfo(out.stage, filter_info)]
                except OutputNotFoundError:
                    pass

            try:
                stages = self.collect(
                    target,
                    with_deps,
                    recursive,
                    graph,
                    accept_group=accept_group,
                )
            except StageFileDoesNotExistError as exc:
                # collect() might try to use `target` as a stage name
                # and raise an error that dvc.yaml does not exist, whereas it
                # should say that both the stage name and the file do not exist.
                if file and is_valid_filename(file):
                    raise
                raise NoOutputOrStageError(target, exc.file) from exc
            except StageNotFound as exc:
                raise NoOutputOrStageError(target, exc.file) from exc

        return [StageInfo(stage) for stage in stages]
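
The StageInfo entries returned above pair each stage with an optional path filter. A minimal sketch of that shape; DVC's actual StageInfo is defined elsewhere, so the NamedTuple below is only an assumption of what it roughly looks like:

from typing import NamedTuple, Optional


class StageInfo(NamedTuple):
    stage: object
    filter_info: Optional[str] = None


print(StageInfo("train-stage"))                # no filter: the whole stage
print(StageInfo("prepare-stage", "data/raw"))  # filtered to a single output path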
Example #7
        def _func(fname):
            from dvc.dvcfile import is_valid_filename
            from dvc.ignore import DvcIgnore

            if dvcfiles:
                return True

            return not (is_valid_filename(fname)
                        or fname == DvcIgnore.DVCIGNORE_FILE)
Example #8
def get(url, path, out=None, rev=None, jobs=None):
    import shortuuid

    from dvc.dvcfile import is_valid_filename
    from dvc.external_repo import external_repo
    from dvc.fs.callbacks import Callback

    out = resolve_output(path, out)

    if is_valid_filename(out):
        raise GetDVCFileError()

    # Creating a directory right beside the output to make sure that they
    # are on the same filesystem, so we can take advantage of
    # reflink and/or hardlink. Not using tempfile.TemporaryDirectory
    # because it will create a symlink to tmpfs, which defeats the purpose
    # and won't work with reflink/hardlink.
    dpath = os.path.dirname(os.path.abspath(out))
    tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid()))

    # Try any links possible to avoid data duplication.
    #
    # Not using symlink, because we need to remove cache after we
    # are done, and to make that work we would have to copy data
    # over anyway before removing the cache, so we might just copy
    # it right away.
    #
    # Also, we can't use the theoretical "move" link type here, because
    # the same cache file might be used a few times in a directory.
    cache_types = ["reflink", "hardlink", "copy"]
    try:
        with external_repo(
            url=url, rev=rev, cache_dir=tmp_dir, cache_types=cache_types
        ) as repo:

            if os.path.isabs(path):
                from dvc.fs.data import DataFileSystem

                fs = DataFileSystem(repo=repo, workspace="local")
                fs_path = path
            else:
                fs = repo.dvcfs
                fs_path = fs.from_os_path(path)

            with Callback.as_tqdm_callback(
                desc=f"Downloading {fs.path.name(path)}",
                unit="files",
            ) as cb:
                fs.get(
                    fs_path,
                    os.path.abspath(out),
                    batch_size=jobs,
                    callback=cb,
                )
    finally:
        remove(tmp_dir)
Example #9
    def _validate_output_path(cls, path, stage=None):
        from dvc.dvcfile import is_valid_filename

        if is_valid_filename(path):
            raise cls.IsStageFileError(path)

        if stage:
            check = stage.repo.tree.dvcignore.check_ignore(path)
            if check.match:
                raise cls.IsIgnoredError(check)
Example #10
    def _validate_output_path(self, path, stage=None):
        from dvc.dvcfile import is_valid_filename

        if is_valid_filename(path):
            raise self.IsStageFileError(path)

        if stage:
            abs_path = os.path.join(stage.wdir, path)
            if self._is_path_dvcignore(abs_path):
                check = stage.repo.dvcignore.check_ignore(abs_path)
                raise self.IsIgnoredError(check)
Example #11
def parse_target(
    target: str, default: str = None, isa_glob: bool = False
) -> Tuple[Optional[str], Optional[str]]:
    from dvc.dvcfile import PIPELINE_FILE, PIPELINE_LOCK, is_valid_filename
    from dvc.exceptions import DvcException
    from dvc.parsing import JOIN

    if not target:
        return None, None

    default = default or PIPELINE_FILE
    if isa_glob:
        path, _, glob = target.rpartition(":")
        return path or default, glob or None

    # look for first "@", so as not to assume too much about stage name
    # eg: it might contain ":" in a generated stages from dict which might
    # affect further parsing with the regex.
    group, _, key = target.partition(JOIN)
    match = TARGET_REGEX.match(group)

    if not match:
        return target, None

    path, name = (
        match.group("path"),
        match.group("name"),
    )

    if name and key:
        name += f"{JOIN}{key}"

    if path:
        if os.path.basename(path) == PIPELINE_LOCK:
            raise DvcException(
                "Did you mean: `{}`?".format(
                    target.replace(".lock", ".yaml", 1)
                )
            )
        if not name:
            ret = (target, None)
            return ret if is_valid_filename(target) else ret[::-1]

    if not path:
        logger.trace(  # type: ignore[attr-defined]
            "Assuming file to be '%s'", default
        )

    return path or default, name
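
The comment about the "@" split above concerns targets of generated (foreach) stages. A tiny illustration, assuming JOIN == "@" as imported from dvc.parsing:

JOIN = "@"  # assumed value of dvc.parsing.JOIN

target = "dvc.yaml:build@linux"
group, _, key = target.partition(JOIN)
print(group, key)  # dvc.yaml:build linux

# The regex is then applied to `group` only, and the key is re-attached,
# yielding path='dvc.yaml' and name='build@linux'.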
Example #12
    def _walk(self, repo_walk, dvc_walk=None, dvcfiles=False):
        from dvc.dvcfile import is_valid_filename

        assert repo_walk
        try:
            _, dvc_dirs, dvc_fnames = (next(dvc_walk) if dvc_walk else
                                       (None, [], []))
            repo_root, repo_dirs, repo_fnames = next(repo_walk)
        except StopIteration:
            return

        # separate subdirs into shared dirs, dvc-only dirs, repo-only dirs
        dvc_set = set(dvc_dirs)
        repo_set = set(repo_dirs)
        dvc_only = list(dvc_set - repo_set)
        repo_only = list(repo_set - dvc_set)
        shared = list(dvc_set & repo_set)
        dirs = shared + dvc_only + repo_only

        # merge file lists
        files = {
            fname
            for fname in dvc_fnames + repo_fnames
            if dvcfiles or not is_valid_filename(fname)
        }

        yield repo_root, dirs, list(files)

        def is_dvc_repo(d):
            return self._is_dvc_repo(os.path.join(repo_root, d))

        # remove subrepos to prevent them from being traversed
        subrepos = set(filter(is_dvc_repo, repo_only))
        # set dir order for next recursion level - shared dirs first so that
        # next() for both generators recurses into the same shared directory
        dvc_dirs[:] = [dirname for dirname in dirs if dirname in dvc_set]
        repo_dirs[:] = lfilter(lambda d: d in (repo_set - subrepos), dirs)

        for dirname in dirs:
            if dirname in subrepos:
                dir_path = os.path.join(repo_root, dirname)
                yield from self._subrepo_walk(dir_path, dvcfiles=dvcfiles)
            elif dirname in shared:
                yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)
            elif dirname in dvc_set:
                yield from self._dvc_walk(dvc_walk)
            elif dirname in repo_set:
                yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)
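
The "shared dirs first" ordering in the walk above keeps the dvc and repo generators descending into the same directory on each step. A small, self-contained sketch of that merge on plain lists:

dvc_dirs = ["data", "models", "metrics"]  # dirs visible to the DVC view
repo_dirs = ["data", "src", "models"]     # dirs present in the git worktree

dvc_set, repo_set = set(dvc_dirs), set(repo_dirs)
shared = list(dvc_set & repo_set)
dvc_only = list(dvc_set - repo_set)
repo_only = list(repo_set - dvc_set)

# emitting shared dirs first means next() on both walks recurses into the
# same shared directory before either walk visits its exclusive dirs
dirs = shared + dvc_only + repo_only
print(dirs)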
Example #13
    def collect_granular(self, target, *args, **kwargs):
        from ..dvcfile import Dvcfile, is_valid_filename

        if not target:
            return [(stage, None) for stage in self.stages]

        file, name = parse_target(target)
        if is_valid_filename(file) and not kwargs.get("with_deps"):
            # Optimization: do not collect the graph for a specific .dvc target
            stages = Dvcfile(self, file).stages.filter(name)
            return [(stage, None) for stage in stages.values()]

        try:
            (out, ) = self.find_outs_by_path(file, strict=False)
            filter_info = PathInfo(os.path.abspath(file))
            return [(out.stage, filter_info)]
        except OutputNotFoundError:
            stages = self.collect(target, *args, **kwargs)
            return [(stage, None) for stage in stages]
Example #14
    def _walk(self, repo_walk, dvc_walk=None, dvcfiles=False):
        assert repo_walk
        try:
            _, dvc_dirs, dvc_fnames = (next(dvc_walk) if dvc_walk else
                                       (None, [], []))
            repo_root, repo_dirs, repo_fnames = next(repo_walk)
        except StopIteration:
            return

        # separate subdirs into shared dirs, dvc-only dirs, repo-only dirs
        dvc_set = set(dvc_dirs)
        repo_set = set(repo_dirs)
        dvc_only = list(dvc_set - repo_set)
        repo_only = list(repo_set - dvc_set)
        shared = list(dvc_set & repo_set)
        dirs = shared + dvc_only + repo_only

        # merge file lists
        files = {
            fname
            for fname in dvc_fnames + repo_fnames
            if dvcfiles or not is_valid_filename(fname)
        }

        yield repo_root, dirs, list(files)

        # set dir order for next recursion level - shared dirs first so that
        # next() for both generators recurses into the same shared directory
        dvc_dirs[:] = [dirname for dirname in dirs if dirname in dvc_set]
        repo_dirs[:] = [dirname for dirname in dirs if dirname in repo_set]

        for dirname in dirs:
            dir_path = os.path.join(repo_root, dirname)
            if self._is_dvc_repo(dir_path):
                yield from self._subrepo_walk(dir_path, dvcfiles=dvcfiles)
            elif dirname in shared:
                yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles)
            elif dirname in dvc_set:
                yield from self._dvc_walk(dvc_walk)
            elif dirname in repo_set:
                yield from self._walk(repo_walk, None, dvcfiles=dvcfiles)
Example #15
File: tree.py Project: bgheneti/dvc
    def _walk(self, dvc_walk, repo_walk, dvcfiles=False):
        try:
            _, dvc_dirs, dvc_fnames = next(dvc_walk)
            repo_root, repo_dirs, repo_fnames = next(repo_walk)
        except StopIteration:
            return

        # separate subdirs into shared dirs, dvc-only dirs, repo-only dirs
        dvc_set = set(dvc_dirs)
        repo_set = set(repo_dirs)
        dvc_only = list(dvc_set - repo_set)
        repo_only = list(repo_set - dvc_set)
        shared = list(dvc_set & repo_set)
        dirs = shared + dvc_only + repo_only

        # merge file lists
        files = {
            fname
            for fname in dvc_fnames + repo_fnames
            if dvcfiles or not is_valid_filename(fname)
        }

        yield repo_root, dirs, list(files)

        # set dir order for next recursion level - shared dirs first so that
        # next() for both generators recurses into the same shared directory
        dvc_dirs[:] = [dirname for dirname in dirs if dirname in dvc_set]
        repo_dirs[:] = [dirname for dirname in dirs if dirname in repo_set]

        for dirname in dirs:
            if dirname in shared:
                yield from self._walk(dvc_walk, repo_walk, dvcfiles=dvcfiles)
            elif dirname in dvc_set:
                yield from self._walk_one(dvc_walk)
            elif dirname in repo_set:
                yield from self._walk_one(repo_walk)
Example #16
    def _validate_output_path(cls, path):
        from dvc.dvcfile import is_valid_filename

        if is_valid_filename(path):
            raise cls.IsStageFileError(path)
Example #17
def is_dvcfile_and_not_ignored(root, file):
    return is_valid_filename(file) and not is_ignored(f"{root}{sep}{file}")