Example #1
0
 def __init__(self, path, message):
     """Compose the "unable to read" error text.

     ``path`` is passed through ``relpath`` before being embedded in the
     message; ``message`` is appended verbatim.
     """
     shown_path = relpath(path)
     super().__init__(f"unable to read: '{shown_path}', {message}")
Example #2
0
    def _executors_repro(
        self, executors: dict, jobs: Optional[int] = 1
    ) -> Mapping[str, Mapping[str, str]]:
        """Run dvc repro for the specified BaseExecutors in parallel.

        Each executor's ``reproduce`` is submitted to a process pool. On
        ``KeyboardInterrupt`` SIGINT is forwarded to the running executor
        processes and futures which have not started are cancelled. Every
        executor is cleaned up afterwards regardless of the outcome.

        Args:
            executors: mapping of stash rev to the executor reproducing it.
            jobs: passed to ``ProcessPoolExecutor`` as ``max_workers``.

        Returns:
            dict mapping stash revs to the successfully executed experiments
            for each stash rev.
        """
        result: Dict[str, Dict[str, str]] = defaultdict(dict)

        manager = Manager()
        # Executors report ``(rev, pid)`` pairs through this queue.
        pid_q = manager.Queue()

        rel_cwd = relpath(os.getcwd(), self.repo.root_dir)
        with ProcessPoolExecutor(max_workers=jobs) as workers:
            futures = {}
            for rev, executor in executors.items():
                future = workers.submit(
                    executor.reproduce,
                    executor.dvc_dir,
                    rev,
                    queue=pid_q,
                    name=executor.name,
                    rel_cwd=rel_cwd,
                    log_level=logger.getEffectiveLevel(),
                )
                futures[future] = (rev, executor)

            try:
                wait(futures)
            except KeyboardInterrupt:
                # forward SIGINT to any running executor processes and
                # cancel any remaining futures
                pids = {}
                while not pid_q.empty():
                    reported_rev, reported_pid = pid_q.get()
                    pids[reported_rev] = reported_pid
                for future, (rev, _) in futures.items():
                    if future.running():
                        # A future may be flagged as running before its
                        # process has reported a PID; skip it instead of
                        # failing with KeyError, which would abort the
                        # interrupt handling and the cleanup loop below.
                        pid = pids.get(rev)
                        if pid is not None:
                            os.kill(pid, signal.SIGINT)
                    elif not future.done():
                        future.cancel()

            for future, (rev, executor) in futures.items():
                try:
                    exc = future.exception()
                    if exc is None:
                        exec_result = future.result()
                        result[rev].update(
                            self._collect_executor(executor, exec_result)
                        )
                    elif not isinstance(exc, CheckpointKilledError):
                        logger.error(
                            "Failed to reproduce experiment '%s'", rev[:7],
                        )
                except CancelledError:
                    logger.error(
                        "Cancelled before attempting to reproduce experiment "
                        "'%s'",
                        rev[:7],
                    )
                finally:
                    executor.cleanup()

        return result
Example #3
0
 def relpath(self):
     """Return ``self.path`` converted by the module-level ``relpath``."""
     own_path = self.path
     return relpath(own_path)
Example #4
0
def _log_exceptions(exc: Exception) -> Optional[int]:
    """Log a tailored message for known exceptions that are not DVCExceptions.

    Exceptions that match none of the handled types are delegated to
    ``_log_unknown_exceptions``.

    Returns:
        251 for ``AuthError``/``ConfigError``; ``None`` in every other case.
    """
    from dvc.utils import error_link, format_link

    if isinstance(exc, OSError):
        import errno

        # Only "too many open files" gets a dedicated hint.
        if exc.errno != errno.EMFILE:
            _log_unknown_exceptions()
            return None

        logger.exception(
            f"too many open files, please visit {error_link('many-files')} "
            "to see how to handle this problem",
            extra={"tb_only": True},
        )
        return None

    from dvc.fs import AuthError, ConfigError, RemoteMissingDepsError

    if isinstance(exc, RemoteMissingDepsError):
        from dvc.utils.pkg import PKG

        proto = exc.protocol
        install_cmds = {
            "pip": f"pip install 'dvc[{proto}]'",
            "conda": f"conda install -c conda-forge dvc-{proto}",
        }
        install_cmd = install_cmds.get(PKG)

        if not install_cmd:
            # No install command is known for this PKG value.
            link = format_link("https://github.com/iterative/dvc/issues")
            hint = f"\nPlease report this bug to {link}. Thank you!"
        else:
            link = format_link("https://dvc.org/doc/install")
            hint = (
                "To install dvc with those dependencies, run:\n\n"
                f"\t{install_cmd}\n\n"
                f"See {link} for more info."
            )

        logger.exception(
            f"URL '{exc.url}' is supported but requires these missing "
            f"dependencies: {exc.missing_deps}. {hint}",
            extra={"tb_only": True},
        )
        return None

    if isinstance(exc, (AuthError, ConfigError)):
        link = format_link("https://man.dvc.org/remote/modify")
        logger.exception("configuration error")
        logger.exception(
            f"{exc!s}\nLearn more about configuration settings at {link}.",
            extra={"tb_only": True},
        )
        return 251

    from dvc_data.hashfile.cache import DiskError

    if isinstance(exc, DiskError):
        from dvc.utils import relpath

        directory = relpath(exc.directory)
        logger.exception(
            f"Could not open pickled '{exc.type}' cache.\n"
            f"Remove the '{directory}' directory and then retry this command."
            f"\nSee {error_link('pickle')} for more information.",
            extra={"tb_only": True},
        )
        return None

    from dvc_data.stage import IgnoreInCollectedDirError

    if not isinstance(exc, IgnoreInCollectedDirError):
        _log_unknown_exceptions()
        return None

    logger.exception("")
    return None
Example #5
0
def _fspath_dir(path, root):
    """Return *path* relative to *root*, marking directories with a trailing separator.

    Args:
        path: path-like object to convert.
        root: directory the result is made relative to.

    Returns:
        ``str(path)`` unchanged when the path does not exist; otherwise the
        path relative to ``root``, with a trailing path separator appended
        when it is a directory.
    """
    if not os.path.exists(str(path)):
        return str(path)

    location = fspath(path)
    # Probe the original location rather than the relative result:
    # os.path.isdir() resolves relative paths against the current working
    # directory, which is not necessarily `root`.
    is_directory = os.path.isdir(location)
    relative = relpath(location, root)
    return os.path.join(relative, "") if is_directory else relative
Example #6
0
 def __init__(self, output):
     """Build the "unable to find DVC-file" message for ``output``.

     The output is passed through ``relpath`` before being embedded.
     """
     message = "unable to find DVC-file with output '{path}'".format(
         path=relpath(output))
     super(OutputNotFoundError, self).__init__(message)
Example #7
0
 def is_ignored(self, path: str) -> bool:
     """Ask ``self.repo.path_is_ignored`` about ``path`` relative to ``self.root_dir``."""
     rel_to_root = relpath(path, self.root_dir)
     return self.repo.path_is_ignored(rel_to_root)
Example #8
0
def check_dvc_filename(path):
    """Raise ``StageFileBadNameError`` unless ``is_valid_filename(path)`` holds.

    The error text shows the path via ``relpath``, the ``PIPELINE_FILE``
    name and a ``.dvc`` example derived from the path's base name.
    """
    if is_valid_filename(path):
        return

    template = (
        "bad DVC file name '{}'. DVC files should be named "
        "'{}' or have a '.dvc' suffix (e.g. '{}.dvc')."
    )
    raise StageFileBadNameError(
        template.format(relpath(path), PIPELINE_FILE, os.path.basename(path)))
Example #9
0
 def is_ignored(self, path: str) -> bool:
     """Ask ``self.repo.path_is_ignored`` about ``path`` relative to ``self.root_dir``.

     When ``os.name`` is ``"nt"`` the relative path is converted to forward
     slashes before the lookup.
     """
     rel = relpath(path, self.root_dir)
     if os.name == "nt":
         # str.replace returns a new string, so the result must be kept;
         # discarding it leaves the backslashes in place.
         rel = rel.replace("\\", "/")
     return self.repo.path_is_ignored(rel)
Example #10
0
 def __str__(self):
     """Return the object's ``__fspath__()`` value converted by ``relpath``."""
     return relpath(self.__fspath__())
Example #11
0
 def relpath(self, other):
     """Return a new instance of this class wrapping ``relpath(self, other)``."""
     relative = relpath(self, other)
     return self.__class__(relative)
Example #12
0
 def __init__(self, output, repo=None):
     """Store ``output`` and ``repo`` and build the "Unable to find" message.

     The message shows ``self.output`` via ``relpath``.
     """
     self.output = output
     self.repo = repo
     message = "Unable to find DVC-file with output '{path}'".format(
         path=relpath(self.output))
     super().__init__(message)
Example #13
0
 def resolve_wdir(self):
     """Return ``self.wdir`` relative to the directory of ``self.path``.

     The result is in POSIX form; ``None`` is returned when the relative
     path is ``"."``.
     """
     rel_wdir = relpath(self.wdir, os.path.dirname(self.path))
     if rel_wdir == ".":
         return None
     return pathlib.PurePath(rel_wdir).as_posix()
Example #14
0
 def _fix_outs_deps_path(self, wdir):
     """Re-point ``def_path`` of in-repo outs and deps so it is relative to ``wdir``."""
     for entry in chain(self.outs, self.deps):
         if not entry.is_in_repo:
             continue
         entry.def_path = relpath(entry.path_info, wdir)
Example #15
0
 def __init__(self, path):
     """Build the bad-metric message for ``path`` (shown via ``relpath``)."""
     message = "'{}' does not exist, not a metric or is malformed".format(
         relpath(path))
     super(BadMetricError, self).__init__(message)
Example #16
0
    def collect(
        self,
        targets: List[str] = None,
        revs: List[str] = None,
        recursive: bool = False,
    ) -> Dict[str, Dict]:
        """Gather props and raw file contents for plots, per revision.

        The returned structure looks like:
            {rev: {plots.csv: {
                props: {x: ..., "header": ..., ...},
                data: "...data as a string...",
            }}}
        The contents are stored unparsed because parsing depends on props.
        Files that cannot be opened or decoded keep their "props" entry but
        get no "data" key.
        """
        from dvc.config import NoRemoteError
        from dvc.fs.repo import RepoFileSystem
        from dvc.utils.collections import ensure_list

        targets = ensure_list(targets)
        collected: Dict[str, Dict] = {}
        for rev in self.repo.brancher(revs=revs):
            # .brancher() adds unwanted workspace
            if revs is not None and rev not in revs:
                continue
            rev = rev or "workspace"

            fs = RepoFileSystem(self.repo)
            plots = _collect_plots(self.repo, targets, rev, recursive)
            for path_info, props in plots.items():
                rev_data = collected.setdefault(rev, {})

                if not fs.isdir(path_info):
                    plot_files = [
                        (path_info, relpath(path_info, self.repo.root_dir))
                    ]
                else:
                    try:
                        plot_files = [
                            (pi, relpath(pi, self.repo.root_dir))
                            for pi in fs.walk_files(path_info)
                        ]
                    except NoRemoteError:
                        logger.debug(
                            (
                                "Could not find cache for directory '%s' on "
                                "'%s'. Files inside will not be plotted."
                            ),
                            path_info,
                            rev,
                        )
                        continue

                for path, repo_path in plot_files:
                    plot_entry = {"props": props}
                    rev_data[repo_path] = plot_entry

                    # Load data from git or dvc cache
                    try:
                        with fs.open(path) as fd:
                            plot_entry["data"] = fd.read()
                    except FileNotFoundError:
                        # This might happen simply because cache is absent
                        logger.debug(
                            (
                                "Could not find '%s' on '%s'. "
                                "File will not be plotted"
                            ),
                            path,
                            rev,
                        )
                    except UnicodeDecodeError:
                        logger.debug(
                            (
                                "'%s' at '%s' is binary file. It will not be "
                                "plotted."
                            ),
                            path,
                            rev,
                        )

        return collected
Example #17
0
 def __repr__(self):
     """Return "<ClassName>: <path relative to the repo root_dir>"."""
     cls_name = self.__class__.__name__
     shown_path = relpath(self.path, self.repo.root_dir)
     return "{}: {}".format(cls_name, shown_path)
Example #18
0
File: __init__.py Project: Suor/dvc
    def _executors_repro(
        self, executors: dict, jobs: Optional[int] = 1
    ) -> Dict[str, Dict[str, str]]:
        """Run dvc repro for the specified BaseExecutors in parallel.

        Each executor's ``reproduce`` is submitted to a process pool together
        with a per-rev pidfile path. On ``KeyboardInterrupt`` SIGINT is
        forwarded to the running executor processes and futures which have
        not started are cancelled. Every executor is cleaned up afterwards
        regardless of the outcome.

        Args:
            executors: mapping of stash rev to the executor reproducing it.
            jobs: passed to ``ProcessPoolExecutor`` as ``max_workers``.

        Returns:
            dict mapping stash revs to the successfully executed experiments
            for each stash rev.
        """
        result: Dict[str, Dict[str, str]] = defaultdict(dict)

        manager = Manager()
        # Executors report ``(rev, pid)`` pairs through this queue.
        pid_q = manager.Queue()

        rel_cwd = relpath(os.getcwd(), self.repo.root_dir)
        with ProcessPoolExecutor(max_workers=jobs) as workers:
            futures = {}
            for rev, executor in executors.items():
                pidfile = os.path.join(
                    self.repo.tmp_dir,
                    self.EXEC_PID_DIR,
                    f"{rev}{executor.PIDFILE_EXT}",
                )
                future = workers.submit(
                    executor.reproduce,
                    executor.dvc_dir,
                    rev,
                    queue=pid_q,
                    name=executor.name,
                    rel_cwd=rel_cwd,
                    log_level=logger.getEffectiveLevel(),
                    pidfile=pidfile,
                    git_url=executor.git_url,
                )
                futures[future] = (rev, executor)

            try:
                wait(futures)
            except KeyboardInterrupt:
                # forward SIGINT to any running executor processes and
                # cancel any remaining futures
                workers.shutdown(wait=False)
                pids = {}
                for future, (rev, _) in futures.items():
                    if future.running():
                        # if future has already been started by the scheduler
                        # we still have to wait until it tells us its PID.
                        # The dequeued pair gets its own names: rebinding
                        # `rev` here would end the wait after one item and
                        # signal whichever process happened to report first.
                        while rev not in pids:
                            reported_rev, reported_pid = pid_q.get()
                            pids[reported_rev] = reported_pid
                        os.kill(pids[rev], signal.SIGINT)
                    elif not future.done():
                        future.cancel()

            for future, (rev, executor) in futures.items():
                try:
                    exc = future.exception()
                    if exc is None:
                        exec_result = future.result()
                        result[rev].update(
                            self._collect_executor(executor, exec_result)
                        )
                    elif not isinstance(exc, CheckpointKilledError):
                        logger.error(
                            "Failed to reproduce experiment '%s'", rev[:7]
                        )
                except CancelledError:
                    logger.error(
                        "Cancelled before attempting to reproduce experiment "
                        "'%s'",
                        rev[:7],
                    )
                finally:
                    executor.cleanup()

        return result
Example #19
0
    def __init__(self, path, external_repo_path, external_repo_url):
        """Build the "Output not found in target repository" message.

        ``path`` is shown relative to ``external_repo_path``; the repository
        is identified by ``external_repo_url``.
        """
        from dvc.utils import relpath

        path_in_repo = relpath(path, external_repo_path)
        message = "Output '{}' not found in target repository '{}'".format(
            path_in_repo, external_repo_url)
        super().__init__(message)
Example #20
0
File: __init__.py Project: Suor/dvc
    def _workspace_repro(self) -> Mapping[str, str]:
        """Run the most recently stashed experiment in the workspace.

        Returns:
            ``{exp_rev: exp_hash}`` for the reproduced experiment, or an
            empty dict when the run produced no ``ref_info`` or was stopped
            by ``CheckpointKilledError``.

        Raises:
            DvcException: if the executor result has no ``exp_hash``, or if
                any other exception escapes the reproduction (it is chained
                as the cause).
        """
        from dvc.utils.fs import makedirs

        from .executor.base import BaseExecutor

        # Only the first stash entry is run, and it must be at index 0.
        entry = first(self.stash_revs.values())
        assert entry.index == 0

        # NOTE: the stash commit to be popped already contains all the current
        # workspace changes plus CLI modified --params changes.
        # `checkout --force` here will not lose any data (popping stash commit
        # will result in conflict between workspace params and stashed CLI
        # params, but we always want the stashed version).
        with self.scm.detach_head(entry.rev, force=True):
            rev = self.stash.pop()
            self.scm.set_ref(EXEC_BASELINE, entry.baseline_rev)
            # EXEC_BRANCH mirrors the entry's branch; a leftover ref is
            # dropped when the entry has no branch.
            if entry.branch:
                self.scm.set_ref(EXEC_BRANCH, entry.branch, symbolic=True)
            elif self.scm.get_ref(EXEC_BRANCH):
                self.scm.remove_ref(EXEC_BRANCH)
            try:
                # Remembered so the `finally` block can tell whether the
                # checkpoint ref changed during the run.
                # NOTE(review): this is assigned inside the `try`; if this
                # get_ref() call raised, the `finally` block below would
                # reference an unbound name - confirm whether that matters.
                orig_checkpoint = self.scm.get_ref(EXEC_CHECKPOINT)
                pid_dir = os.path.join(self.repo.tmp_dir, self.EXEC_PID_DIR)
                if not os.path.exists(pid_dir):
                    makedirs(pid_dir)
                pidfile = os.path.join(
                    pid_dir, f"workspace{BaseExecutor.PIDFILE_EXT}"
                )
                exec_result = BaseExecutor.reproduce(
                    None,
                    rev,
                    name=entry.name,
                    rel_cwd=relpath(os.getcwd(), self.scm.root_dir),
                    log_errors=False,
                    pidfile=pidfile,
                )

                if not exec_result.exp_hash:
                    raise DvcException(
                        f"Failed to reproduce experiment '{rev[:7]}'"
                    )
                if not exec_result.ref_info:
                    # repro succeeded but result matches baseline
                    # (no experiment generated or applied)
                    return {}
                exp_rev = self.scm.get_ref(str(exec_result.ref_info))
                self.scm.set_ref(EXEC_APPLY, exp_rev)
                return {exp_rev: exec_result.exp_hash}
            except CheckpointKilledError:
                # Checkpoint errors have already been logged
                return {}
            except DvcException:
                # Already the right type; let it propagate untouched.
                raise
            except Exception as exc:
                raise DvcException(
                    f"Failed to reproduce experiment '{rev[:7]}'"
                ) from exc
            finally:
                # Undo the temporary refs set up before the run.
                self.scm.remove_ref(EXEC_BASELINE)
                if entry.branch:
                    self.scm.remove_ref(EXEC_BRANCH)
                # If the checkpoint ref moved during the run, point
                # EXEC_APPLY at the new checkpoint.
                checkpoint = self.scm.get_ref(EXEC_CHECKPOINT)
                if checkpoint and checkpoint != orig_checkpoint:
                    self.scm.set_ref(EXEC_APPLY, checkpoint)
Example #21
0
 def is_ignored(self, path: str) -> bool:
     """Return whether ``path`` (relative to ``self.root_dir``) is ignored."""
     rel_to_root = relpath(path, self.root_dir)
     # `is_ignored` returns `false` if excluded in `.gitignore` and
     # `None` if it's not mentioned at all. `True` if it is ignored.
     verdict = self.ignore_manager.is_ignored(rel_to_root)
     return bool(verdict)
Example #22
0
 def is_tracked(self, path):
     """Return True if ``path`` has an index entry in any of stages 0-3."""
     # it is equivalent to `bool(self.repo.git.ls_files(path))` by
     # functionality, but ls_files fails on unicode filenames
     path = relpath(path, self.root_dir)
     entries = self.repo.index.entries
     # There are 4 stages, see BaseIndexEntry.stage
     for stage in (0, 1, 2, 3):
         if (path, stage) in entries:
             return True
     return False
Example #23
0
 def is_tracked(self, path):
     """Return True if ``path`` matches the first item of any index entry key."""
     # it is equivalent to `bool(self.repo.git.ls_files(path))` by
     # functionality, but ls_files fails on unicode filenames
     rel_to_root = relpath(path, self.root_dir)
     return any(
         entry[0] == rel_to_root for entry in self.repo.index.entries
     )
Example #24
0
File: stage.py Project: nikie/dvc
    def create(repo, **kwargs):
        """Build a ``Stage`` for ``repo`` from keyword options.

        Keyword options read here: ``wdir``, ``cwd``, ``fname``, ``add``,
        ``cmd``, ``locked``, ``always_changed``, ``deps``, ``erepo``,
        ``ignore_build_cache``, ``remove_outs`` and ``overwrite``. The full
        ``kwargs`` are also forwarded to ``Stage._fill_stage_outputs``.

        Returns:
            The new stage, or ``None`` when a stage file already exists at
            the target path and the stage is considered cached.

        Raises:
            StageFileBadNameError: ``cwd`` is used together with an ``fname``
                that contains a directory component.
            StageFileAlreadyExistsError: the stage file exists, ``overwrite``
                is falsy and the confirmation prompt is declined.

        An existing stage file at the target path is deleted before the
        stage is returned.
        """

        wdir = kwargs.get("wdir", None)
        cwd = kwargs.get("cwd", None)
        fname = kwargs.get("fname", None)
        add = kwargs.get("add", False)

        # Backward compatibility for `cwd` option
        if wdir is None and cwd is not None:
            if fname is not None and os.path.basename(fname) != fname:
                raise StageFileBadNameError(
                    "DVC-file name '{fname}' may not contain subdirectories"
                    " if `-c|--cwd` (deprecated) is specified. Use `-w|--wdir`"
                    " along with `-f` to specify DVC-file path with working"
                    " directory.".format(fname=fname))
            wdir = cwd
        elif wdir is None:
            wdir = os.curdir

        stage = Stage(
            repo=repo,
            wdir=wdir,
            cmd=kwargs.get("cmd", None),
            locked=kwargs.get("locked", False),
            always_changed=kwargs.get("always_changed", False),
        )

        Stage._fill_stage_outputs(stage, **kwargs)
        stage.deps = dependency.loads_from(stage,
                                           kwargs.get("deps", []),
                                           erepo=kwargs.get("erepo", None))

        stage._check_circular_dependency()
        stage._check_duplicated_arguments()

        # Without an explicit name, derive the stage file name from the outs.
        if not fname:
            fname = Stage._stage_fname(stage.outs, add)
        stage._check_dvc_filename(fname)

        # Autodetecting wdir for add, we need to create outs first to do that,
        # so we start with wdir = . and remap out paths later.
        if add and kwargs.get("wdir") is None and cwd is None:
            wdir = os.path.dirname(fname)

            for out in chain(stage.outs, stage.deps):
                if out.is_in_repo:
                    out.def_path = relpath(out.path_info, wdir)

        wdir = os.path.abspath(wdir)

        # With the deprecated `cwd` option the stage file lives inside wdir;
        # otherwise fname is resolved against the current directory.
        if cwd is not None:
            path = os.path.join(wdir, fname)
        else:
            path = os.path.abspath(fname)

        Stage._check_stage_path(repo, wdir)
        Stage._check_stage_path(repo, os.path.dirname(path))

        stage.wdir = wdir
        stage.path = path

        ignore_build_cache = kwargs.get("ignore_build_cache", False)

        # NOTE: remove outs before we check build cache
        if kwargs.get("remove_outs", False):
            logger.warning("--remove-outs is deprecated."
                           " It is now the default behavior,"
                           " so there's no need to use this option anymore.")
            stage.remove_outs(ignore_remove=False)
            logger.warning("Build cache is ignored when using --remove-outs.")
            ignore_build_cache = True

        if os.path.exists(path) and any(out.persist for out in stage.outs):
            logger.warning("Build cache is ignored when persisting outputs.")
            ignore_build_cache = True

        if os.path.exists(path):
            # Skip the run only when the build cache may be used and the
            # stage is cached, not a callback and not marked always_changed.
            if (not ignore_build_cache and stage.is_cached
                    and not stage.is_callback and not stage.always_changed):
                logger.info("Stage is cached, skipping.")
                return None

            msg = ("'{}' already exists. Do you wish to run the command and "
                   "overwrite it?".format(stage.relpath))

            # The prompt is only shown when `overwrite` is falsy.
            if not kwargs.get("overwrite", True) and not prompt.confirm(msg):
                raise StageFileAlreadyExistsError(stage.relpath)

            os.unlink(path)

        return stage
Example #25
0
 def dvc_dir(self):
     """Return the repo's ``dvc_dir`` relative to its SCM ``root_dir``."""
     repo = self.repo
     return relpath(repo.dvc_dir, repo.scm.root_dir)
Example #26
0
File: stage.py Project: nikie/dvc
 def _check_dvc_filename(fname):
     """Raise ``StageFileBadNameError`` unless ``Stage.is_valid_filename(fname)`` holds.

     The error text shows the name via ``relpath`` plus a ``.dvc`` example
     derived from its base name.
     """
     if Stage.is_valid_filename(fname):
         return

     template = (
         "bad DVC-file name '{}'. DVC-files should be named "
         "'Dvcfile' or have a '.dvc' suffix (e.g. '{}.dvc')."
     )
     raise StageFileBadNameError(
         template.format(relpath(fname), os.path.basename(fname)))
Example #27
0
 def path_in_repo(self):
     """Return ``self.path`` relative to the repo's ``root_dir``."""
     own_path = self.path
     return relpath(own_path, self.repo.root_dir)
Example #28
0
def test_str_workdir_outside_repo(tmp_dir, erepo_dir):
    """``str(output)`` equals "path" taken relative to the external repo root."""
    output = LocalOutput(Stage(erepo_dir.dvc), "path", cache=False)

    expected = relpath("path", erepo_dir.dvc.root_dir)
    assert expected == str(output)
Example #29
0
    def graph(self, stages=None, from_directory=None):
        """Build the stage dependency graphs for the given stages.

        Nodes are stage paths relative to the repo root. An edge is added
        whenever a dependency of one stage equals, contains or lies inside
        an output of another stage.

        Edges point from a stage to the stage it depends on. For example,
        running the following:

            $ dvc run -o A "echo A > A"
            $ dvc run -d A -o B "echo B > B"
            $ dvc run -d B -o C "echo C > C"

        Will create the following graph:

               ancestors <--
                           |
                C.dvc -> B.dvc -> A.dvc
                |          |
                |          --> descendants
                |
                ------- pipeline ------>
                           |
                           v
              (weakly connected components)

        Args:
            stages (list): stages to build the graph from; when falsy,
                ``self.stages(from_directory, check_dag=False)`` is used.

            from_directory (str): forwarded to ``self.stages`` when no
                stages are given.

        Returns:
            A ``(G, G_active)`` tuple of directed graphs. ``G_active``
            contains every node but omits the dependency edges of locked
            stages.

        Raises:
            OutputDuplicationError: two outputs with the same path
            StagePathAsOutputError: stage inside an output directory
            OverlappingOutputPathsError: output inside output directory
            CyclicGraphError: resulting graph has cycles
        """
        import networkx as nx
        from dvc.exceptions import (
            OutputDuplicationError,
            StagePathAsOutputError,
            OverlappingOutputPathsError,
        )

        G = nx.DiGraph()
        G_active = nx.DiGraph()
        stages = stages or self.stages(from_directory, check_dag=False)
        stages = [candidate for candidate in stages if candidate]
        outs = []

        # Pass 1: gather all outputs, rejecting overlaps and duplicates.
        for stage in stages:
            for out in stage.outs:
                duplicate_owners = []
                for known in outs:
                    if known.path_info == out.path_info:
                        duplicate_owners.append(known.stage)

                    new_inside_known = out.path_info.isin(known.path_info)
                    known_inside_new = known.path_info.isin(out.path_info)
                    if new_inside_known or known_inside_new:
                        raise OverlappingOutputPathsError(known, out)

                if duplicate_owners:
                    clashing = [stage.relpath, duplicate_owners[0].relpath]
                    raise OutputDuplicationError(str(out), clashing)

                outs.append(out)

        # Pass 2: a stage file must not live inside any output.
        for stage in stages:
            stage_path_info = PathInfo(stage.path)
            if any(stage_path_info.isin(out.path_info) for out in outs):
                raise StagePathAsOutputError(stage.wdir, stage.relpath)

        # Pass 3: add nodes and dependency edges.
        for stage in stages:
            node = relpath(stage.path, self.root_dir)

            for target_graph in (G, G_active):
                target_graph.add_node(node, stage=stage)

            for dep in stage.deps:
                for out in outs:
                    unrelated = (out.path_info != dep.path_info
                                 and not dep.path_info.isin(out.path_info)
                                 and not out.path_info.isin(dep.path_info))
                    if unrelated:
                        continue

                    dep_stage = out.stage
                    dep_node = relpath(dep_stage.path, self.root_dir)
                    G.add_node(dep_node, stage=dep_stage)
                    G.add_edge(node, dep_node)
                    if stage.locked:
                        continue
                    G_active.add_node(dep_node, stage=dep_stage)
                    G_active.add_edge(node, dep_node)

        self._check_cyclic_graph(G)

        return G, G_active
Example #30
0
def init(root_dir=os.curdir, no_scm=False, force=False, subdir=False):
    """
    Create an empty repo in the given directory -- a `.dvc` directory
    holding the configuration, plus a dvcignore file.

    The directory should be tracked by a SCM unless `no_scm` is set.

    An existing DVC directory is only replaced when `force` is set.

    Args:
        root_dir: Path to repo's root directory.
        no_scm: initialize without SCM; recorded in the config as
            `core.no_scm`.
        force: remove an already existing DVC directory first.
        subdir: let the SCM lookup search parent directories.

    Returns:
        Repo instance.

    Raises:
        InvalidArgumentError: both `no_scm` and `subdir` are set.
        InitError: the SCM lookup fails, or the DVC directory already
            exists and `force` is not set.
    """

    if no_scm and subdir:
        raise InvalidArgumentError(
            "Cannot initialize repo with `--no-scm` and `--subdir`")

    root_dir = os.path.realpath(root_dir)
    dvc_dir = os.path.join(root_dir, Repo.DVC_DIR)

    try:
        scm = SCM(root_dir, search_parent_directories=subdir, no_scm=no_scm)
    except SCMError:
        not_tracked = (
            "{repo} is not tracked by any supported SCM tool (e.g. Git). "
            "Use `--no-scm` if you don't want to use any SCM or "
            "`--subdir` if initializing inside a subdirectory of a parent SCM "
            "repository."
        ).format(repo=root_dir)
        raise InitError(not_tracked)

    already_initialized = os.path.isdir(dvc_dir)
    if already_initialized and not force:
        raise InitError("'{repo}' exists. Use `-f` to force.".format(
            repo=relpath(dvc_dir)))
    if already_initialized:
        remove(dvc_dir)

    os.mkdir(dvc_dir)

    config = Config.init(dvc_dir)

    if no_scm:
        with config.edit() as conf:
            conf["core"]["no_scm"] = True

    dvcignore = init_dvcignore(root_dir)

    proj = Repo(root_dir)

    tracked_files = [
        config.files["repo"],
        dvcignore,
        proj.plots.templates.templates_dir,
    ]
    scm.add(tracked_files)

    if scm.ignore_file:
        scm.add([os.path.join(dvc_dir, scm.ignore_file)])
        logger.info("\nYou can now commit the changes to git.\n")

    _welcome_message()

    return proj