def __init__(self, path, message):
    """Initialise the error with a message naming the relative ``path``.

    Args:
        path: location that could not be read; shown via ``relpath``.
        message: extra detail appended after the path.
    """
    path = relpath(path)
    text = f"unable to read: '{path}', {message}"
    super().__init__(text)
def _executors_repro(
    self, executors: dict, jobs: Optional[int] = 1
) -> Mapping[str, Mapping[str, str]]:
    """Run dvc repro for the specified BaseExecutors in parallel.

    Args:
        executors: mapping of stash rev to the executor that reproduces it.
        jobs: passed to ProcessPoolExecutor as ``max_workers``.

    Returns:
        dict mapping stash revs to the successfully executed experiments
        for each stash rev.
    """
    result: Dict[str, Dict[str, str]] = defaultdict(dict)

    manager = Manager()
    # Handed to every executor as ``queue``; read back below as
    # (rev, pid) pairs when an interrupt has to be forwarded.
    pid_q = manager.Queue()

    rel_cwd = relpath(os.getcwd(), self.repo.root_dir)
    with ProcessPoolExecutor(max_workers=jobs) as workers:
        # future -> (rev, executor), so results can be matched back later
        futures = {}
        for rev, executor in executors.items():
            future = workers.submit(
                executor.reproduce,
                executor.dvc_dir,
                rev,
                queue=pid_q,
                name=executor.name,
                rel_cwd=rel_cwd,
                log_level=logger.getEffectiveLevel(),
            )
            futures[future] = (rev, executor)

        try:
            wait(futures)
        except KeyboardInterrupt:
            # forward SIGINT to any running executor processes and
            # cancel any remaining futures
            pids = {}
            # Drain whatever (rev, pid) pairs are currently on the queue.
            while not pid_q.empty():
                rev, pid = pid_q.get()
                pids[rev] = pid
            for future, (rev, _) in futures.items():
                if future.running():
                    # NOTE(review): pids[rev] raises KeyError if this rev's
                    # pid was not on the queue when it was drained above --
                    # confirm whether a running future can get here first.
                    os.kill(pids[rev], signal.SIGINT)
                elif not future.done():
                    future.cancel()

        for future, (rev, executor) in futures.items():
            # Re-reads the same pair the loop header already unpacked.
            rev, executor = futures[future]
            try:
                # future.exception() raises CancelledError for a cancelled
                # future, which is handled below.
                exc = future.exception()
                if exc is None:
                    exec_result = future.result()
                    result[rev].update(
                        self._collect_executor(executor, exec_result)
                    )
                elif not isinstance(exc, CheckpointKilledError):
                    # CheckpointKilledError is deliberately not reported here.
                    logger.error(
                        "Failed to reproduce experiment '%s'", rev[:7],
                    )
            except CancelledError:
                logger.error(
                    "Cancelled before attempting to reproduce experiment "
                    "'%s'",
                    rev[:7],
                )
            finally:
                # Always clean the executor up, whatever the outcome.
                executor.cleanup()

    return result
def relpath(self):
    """Return ``self.path`` passed through the ``relpath`` helper."""
    own_path = self.path
    return relpath(own_path)
def _log_exceptions(exc: Exception) -> Optional[int]:
    """Log selected well-known errors that are not DVCExceptions.

    Each recognised exception type gets a tailored message; anything else
    is delegated to ``_log_unknown_exceptions``.

    Returns:
        251 for AuthError/ConfigError, None for every other exception.
    """
    from dvc.utils import error_link, format_link

    if isinstance(exc, OSError):
        import errno

        # Only "too many open files" gets a dedicated hint.
        if exc.errno != errno.EMFILE:
            _log_unknown_exceptions()
            return None

        logger.exception(
            "too many open files, please visit "
            "{} to see how to handle this "
            "problem".format(error_link("many-files")),
            extra={"tb_only": True},
        )
        return None

    from dvc.fs import AuthError, ConfigError, RemoteMissingDepsError

    if isinstance(exc, RemoteMissingDepsError):
        from dvc.utils.pkg import PKG

        proto = exc.protocol
        install_cmds = {
            "pip": f"pip install 'dvc[{proto}]'",
            "conda": f"conda install -c conda-forge dvc-{proto}",
        }
        cmd = install_cmds.get(PKG)
        if not cmd:
            # No install command is listed for this packaging.
            link = format_link("https://github.com/iterative/dvc/issues")
            hint = f"\nPlease report this bug to {link}. Thank you!"
        else:
            link = format_link("https://dvc.org/doc/install")
            hint = (
                f"To install dvc with those dependencies, run:\n"
                "\n"
                f"\t{cmd}\n"
                "\n"
                f"See {link} for more info."
            )
        logger.exception(
            f"URL '{exc.url}' is supported but requires these missing "
            f"dependencies: {exc.missing_deps}. {hint}",
            extra={"tb_only": True},
        )
        return None

    if isinstance(exc, (AuthError, ConfigError)):
        link = format_link("https://man.dvc.org/remote/modify")
        logger.exception("configuration error")
        logger.exception(
            f"{exc!s}\nLearn more about configuration settings at {link}.",
            extra={"tb_only": True},
        )
        return 251

    from dvc_data.hashfile.cache import DiskError

    if isinstance(exc, DiskError):
        from dvc.utils import relpath

        directory = relpath(exc.directory)
        logger.exception(
            f"Could not open pickled '{exc.type}' cache.\n"
            f"Remove the '{directory}' directory and then retry this command."
            f"\nSee {error_link('pickle')} for more information.",
            extra={"tb_only": True},
        )
        return None

    from dvc_data.stage import IgnoreInCollectedDirError

    if isinstance(exc, IgnoreInCollectedDirError):
        logger.exception("")
    else:
        _log_unknown_exceptions()
    return None
def _fspath_dir(path, root): if not os.path.exists(str(path)): return str(path) path = relpath(fspath(path), root) return os.path.join(path, "") if os.path.isdir(path) else path
def __init__(self, output):
    """Initialise the error for an ``output`` no DVC-file was found for.

    Args:
        output: output path; shown in the message via ``relpath``.
    """
    template = "unable to find DVC-file with output '{path}'"
    message = template.format(path=relpath(output))
    super(OutputNotFoundError, self).__init__(message)
def is_ignored(self, path: str) -> bool:
    """Return what the repo's ``path_is_ignored`` reports for ``path``.

    ``path`` is first made relative to ``self.root_dir``.
    """
    rel_path = relpath(path, self.root_dir)
    return self.repo.path_is_ignored(rel_path)
def check_dvc_filename(path):
    """Raise StageFileBadNameError unless ``path`` is a valid DVC file name.

    Raises:
        StageFileBadNameError: when ``is_valid_filename(path)`` is false.
    """
    if is_valid_filename(path):
        return

    template = (
        "bad DVC file name '{}'. DVC files should be named "
        "'{}' or have a '.dvc' suffix (e.g. '{}.dvc')."
    )
    raise StageFileBadNameError(
        template.format(relpath(path), PIPELINE_FILE, os.path.basename(path))
    )
def is_ignored(self, path: str) -> bool:
    """Return what the repo's ``path_is_ignored`` reports for ``path``.

    ``path`` is made relative to ``self.root_dir``; on Windows
    (``os.name == "nt"``) backslashes are converted to forward slashes
    before the lookup.
    """
    rel = relpath(path, self.root_dir)
    if os.name == "nt":
        # str.replace returns a new string, so the result must be rebound;
        # previously it was discarded and the backslashes were kept.
        rel = rel.replace("\\", "/")
    return self.repo.path_is_ignored(rel)
def __str__(self):
    """Return the object's filesystem path converted by ``relpath``."""
    return relpath(self.__fspath__())
def relpath(self, other):
    """Return a new instance of this class for ``self`` relative to ``other``."""
    relative = relpath(self, other)
    return self.__class__(relative)
def __init__(self, output, repo=None):
    """Record ``output`` and ``repo`` and build the error message.

    Args:
        output: output path; shown in the message via ``relpath``.
        repo: optional value stored on the exception as ``self.repo``.
    """
    self.output = output
    self.repo = repo
    shown = relpath(self.output)
    super().__init__(
        "Unable to find DVC-file with output '{path}'".format(path=shown)
    )
def resolve_wdir(self):
    """Return ``self.wdir`` relative to the directory of ``self.path``.

    Returns:
        The relative working directory in POSIX form, or None when it is
        ``"."``.
    """
    rel_wdir = relpath(self.wdir, os.path.dirname(self.path))
    if rel_wdir == ".":
        return None
    return pathlib.PurePath(rel_wdir).as_posix()
def _fix_outs_deps_path(self, wdir):
    """Rewrite ``def_path`` of in-repo outs and deps relative to ``wdir``."""
    for entry in chain(self.outs, self.deps):
        if not entry.is_in_repo:
            continue
        entry.def_path = relpath(entry.path_info, wdir)
def __init__(self, path):
    """Initialise the error for a metric ``path`` that cannot be used.

    Args:
        path: metric path; shown in the message via ``relpath``.
    """
    template = "'{}' does not exist, not a metric or is malformed"
    super(BadMetricError, self).__init__(template.format(relpath(path)))
def collect(
    self,
    targets: List[str] = None,
    revs: List[str] = None,
    recursive: bool = False,
) -> Dict[str, Dict]:
    """Gather props and raw file contents for every plot, per revision.

    The result is shaped like:
        {rev: {plots.csv: {
            props: {x: ..., "header": ..., ...},
            data: "...data as a string...",
        }}}

    File contents are stored unparsed, since parsing is affected by props.
    """
    from dvc.config import NoRemoteError
    from dvc.fs.repo import RepoFileSystem
    from dvc.utils.collections import ensure_list

    targets = ensure_list(targets)
    collected: Dict[str, Dict] = {}
    for rev in self.repo.brancher(revs=revs):
        # .brancher() adds unwanted workspace
        unwanted = revs is not None and rev not in revs
        if unwanted:
            continue
        rev = rev or "workspace"

        fs = RepoFileSystem(self.repo)
        plots = _collect_plots(self.repo, targets, rev, recursive)
        for path_info, props in plots.items():
            rev_entry = collected.setdefault(rev, {})

            if not fs.isdir(path_info):
                plot_files = [
                    (path_info, relpath(path_info, self.repo.root_dir))
                ]
            else:
                try:
                    plot_files = [
                        (pi, relpath(pi, self.repo.root_dir))
                        for pi in fs.walk_files(path_info)
                    ]
                except NoRemoteError:
                    logger.debug(
                        (
                            "Could not find cache for directory '%s' on "
                            "'%s'. Files inside will not be plotted."
                        ),
                        path_info,
                        rev,
                    )
                    continue

            for path, repo_path in plot_files:
                rev_entry[repo_path] = {"props": props}

                # Load data from git or dvc cache
                try:
                    with fs.open(path) as fd:
                        rev_entry[repo_path]["data"] = fd.read()
                except FileNotFoundError:
                    # This might happen simply because cache is absent
                    logger.debug(
                        (
                            "Could not find '%s' on '%s'. "
                            "File will not be plotted"
                        ),
                        path,
                        rev,
                    )
                except UnicodeDecodeError:
                    logger.debug(
                        (
                            "'%s' at '%s' is binary file. It will not be "
                            "plotted."
                        ),
                        path,
                        rev,
                    )

    return collected
def __repr__(self):
    """Return ``"<ClassName>: <path relative to the repo root>"``."""
    cls_name = self.__class__.__name__
    rel = relpath(self.path, self.repo.root_dir)
    return "{}: {}".format(cls_name, rel)
def _executors_repro(
    self, executors: dict, jobs: Optional[int] = 1
) -> Dict[str, Dict[str, str]]:
    """Run dvc repro for the specified BaseExecutors in parallel.

    Args:
        executors: mapping of stash rev to the executor that reproduces it.
        jobs: passed to ProcessPoolExecutor as ``max_workers``.

    Returns:
        dict mapping stash revs to the successfully executed experiments
        for each stash rev.
    """
    result: Dict[str, Dict[str, str]] = defaultdict(dict)

    manager = Manager()
    # Handed to every executor as ``queue``; read back below as
    # (rev, pid) pairs when an interrupt has to be forwarded.
    pid_q = manager.Queue()

    rel_cwd = relpath(os.getcwd(), self.repo.root_dir)
    with ProcessPoolExecutor(max_workers=jobs) as workers:
        # future -> (rev, executor), so results can be matched back later
        futures = {}
        for rev, executor in executors.items():
            pidfile = os.path.join(
                self.repo.tmp_dir,
                self.EXEC_PID_DIR,
                f"{rev}{executor.PIDFILE_EXT}",
            )
            future = workers.submit(
                executor.reproduce,
                executor.dvc_dir,
                rev,
                queue=pid_q,
                name=executor.name,
                rel_cwd=rel_cwd,
                log_level=logger.getEffectiveLevel(),
                pidfile=pidfile,
                git_url=executor.git_url,
            )
            futures[future] = (rev, executor)

        try:
            wait(futures)
        except KeyboardInterrupt:
            # forward SIGINT to any running executor processes and
            # cancel any remaining futures
            workers.shutdown(wait=False)
            pids = {}
            for future, (rev, _) in futures.items():
                if future.running():
                    # if future has already been started by the scheduler
                    # we still have to wait until it tells us its PID
                    while rev not in pids:
                        # Use a separate name for the dequeued rev. Rebinding
                        # ``rev`` here made the loop stop after the first
                        # message and could signal another experiment's pid.
                        queued_rev, pid = pid_q.get()
                        pids[queued_rev] = pid
                    os.kill(pids[rev], signal.SIGINT)
                elif not future.done():
                    future.cancel()

        for future, (rev, executor) in futures.items():
            try:
                # future.exception() raises CancelledError for a cancelled
                # future, which is handled below.
                exc = future.exception()
                if exc is None:
                    exec_result = future.result()
                    result[rev].update(
                        self._collect_executor(executor, exec_result)
                    )
                elif not isinstance(exc, CheckpointKilledError):
                    # CheckpointKilledError is deliberately not reported here.
                    logger.error(
                        "Failed to reproduce experiment '%s'", rev[:7]
                    )
            except CancelledError:
                logger.error(
                    "Cancelled before attempting to reproduce experiment "
                    "'%s'",
                    rev[:7],
                )
            finally:
                # Always clean the executor up, whatever the outcome.
                executor.cleanup()

    return result
def __init__(self, path, external_repo_path, external_repo_url):
    """Initialise the error for an output missing from an external repo.

    Args:
        path: output path; shown relative to ``external_repo_path``.
        external_repo_path: base directory used to relativise ``path``.
        external_repo_url: repository URL named in the message.
    """
    from dvc.utils import relpath

    rel = relpath(path, external_repo_path)
    super().__init__(
        "Output '{}' not found in target repository '{}'".format(
            rel, external_repo_url
        )
    )
def _workspace_repro(self) -> Mapping[str, str]:
    """Run the most recently stashed experiment in the workspace.

    Returns:
        ``{exp_rev: exp_hash}`` for the reproduced experiment, or an empty
        mapping when the run produced no ``ref_info`` or was stopped by a
        CheckpointKilledError.

    Raises:
        DvcException: when the run yields no ``exp_hash``, or when any
            other exception is raised during reproduction (chained as the
            cause).
    """
    from dvc.utils.fs import makedirs

    from .executor.base import BaseExecutor

    # Take the first stash entry; it is expected to be stash index 0.
    entry = first(self.stash_revs.values())
    assert entry.index == 0

    # NOTE: the stash commit to be popped already contains all the current
    # workspace changes plus CLI modified --params changes.
    # `checkout --force` here will not lose any data (popping stash commit
    # will result in conflict between workspace params and stashed CLI
    # params, but we always want the stashed version).
    with self.scm.detach_head(entry.rev, force=True):
        rev = self.stash.pop()
        self.scm.set_ref(EXEC_BASELINE, entry.baseline_rev)
        if entry.branch:
            self.scm.set_ref(EXEC_BRANCH, entry.branch, symbolic=True)
        elif self.scm.get_ref(EXEC_BRANCH):
            # No branch for this entry: drop a leftover EXEC_BRANCH ref.
            self.scm.remove_ref(EXEC_BRANCH)
        try:
            # Remembered so the ``finally`` block can tell whether the
            # checkpoint ref changed during this run.
            orig_checkpoint = self.scm.get_ref(EXEC_CHECKPOINT)
            pid_dir = os.path.join(self.repo.tmp_dir, self.EXEC_PID_DIR)
            if not os.path.exists(pid_dir):
                makedirs(pid_dir)
            pidfile = os.path.join(
                pid_dir, f"workspace{BaseExecutor.PIDFILE_EXT}"
            )
            # NOTE(review): the first positional argument is None here --
            # confirm its meaning against BaseExecutor.reproduce.
            exec_result = BaseExecutor.reproduce(
                None,
                rev,
                name=entry.name,
                rel_cwd=relpath(os.getcwd(), self.scm.root_dir),
                log_errors=False,
                pidfile=pidfile,
            )

            if not exec_result.exp_hash:
                raise DvcException(
                    f"Failed to reproduce experiment '{rev[:7]}'"
                )
            if not exec_result.ref_info:
                # repro succeeded but result matches baseline
                # (no experiment generated or applied)
                return {}
            exp_rev = self.scm.get_ref(str(exec_result.ref_info))
            self.scm.set_ref(EXEC_APPLY, exp_rev)
            return {exp_rev: exec_result.exp_hash}
        except CheckpointKilledError:
            # Checkpoint errors have already been logged
            return {}
        except DvcException:
            # Already the right type; let it propagate unchanged.
            raise
        except Exception as exc:
            raise DvcException(
                f"Failed to reproduce experiment '{rev[:7]}'"
            ) from exc
        finally:
            # Remove the temporary refs set above, whatever the outcome.
            self.scm.remove_ref(EXEC_BASELINE)
            if entry.branch:
                self.scm.remove_ref(EXEC_BRANCH)
            checkpoint = self.scm.get_ref(EXEC_CHECKPOINT)
            # If the checkpoint ref changed during this run, point
            # EXEC_APPLY at the new checkpoint.
            if checkpoint and checkpoint != orig_checkpoint:
                self.scm.set_ref(EXEC_APPLY, checkpoint)
def is_ignored(self, path: str) -> bool:
    """Return True only when the ignore manager reports ``path`` as ignored."""
    # The manager's `is_ignored` answers `false` if the path is excluded in
    # `.gitignore`, `None` if it is not mentioned at all and `True` if it is
    # ignored; collapse that to a plain bool.
    rel = relpath(path, self.root_dir)
    verdict = self.ignore_manager.is_ignored(rel)
    return bool(verdict)
def is_tracked(self, path):
    """Return True when ``path`` has an index entry at any stage 0-3."""
    # Functionally equivalent to `bool(self.repo.git.ls_files(path))`, but
    # ls_files fails on unicode filenames.
    path = relpath(path, self.root_dir)

    # There are 4 stages, see BaseIndexEntry.stage
    for stage in range(4):
        if (path, stage) in self.repo.index.entries:
            return True
    return False
def is_tracked(self, path):
    """Return True when ``path`` is the first element of any index entry key."""
    # Functionally equivalent to `bool(self.repo.git.ls_files(path))`, but
    # ls_files fails on unicode filenames.
    path = relpath(path, self.root_dir)
    tracked_paths = [entry_key[0] for entry_key in self.repo.index.entries]
    return path in tracked_paths
def create(repo, **kwargs):
    """Build a Stage for ``repo`` from keyword options.

    Keys read directly from ``kwargs`` here: ``wdir``, ``cwd``, ``fname``,
    ``add``, ``cmd``, ``locked``, ``always_changed``, ``deps``, ``erepo``,
    ``ignore_build_cache``, ``remove_outs`` and ``overwrite``. The whole
    mapping is also forwarded to ``Stage._fill_stage_outputs``.

    Returns:
        The new stage, or None when a stage file already exists at the
        target path and the stage is considered cached.

    Raises:
        StageFileBadNameError: ``cwd`` is given without ``wdir`` and
            ``fname`` contains a directory component.
        StageFileAlreadyExistsError: the stage file exists, ``overwrite``
            is false and the user declines the prompt.
    """
    wdir = kwargs.get("wdir", None)
    cwd = kwargs.get("cwd", None)
    fname = kwargs.get("fname", None)
    add = kwargs.get("add", False)

    # Backward compatibility for `cwd` option
    if wdir is None and cwd is not None:
        if fname is not None and os.path.basename(fname) != fname:
            raise StageFileBadNameError(
                "DVC-file name '{fname}' may not contain subdirectories"
                " if `-c|--cwd` (deprecated) is specified. Use `-w|--wdir`"
                " along with `-f` to specify DVC-file path with working"
                " directory.".format(fname=fname))
        wdir = cwd
    elif wdir is None:
        wdir = os.curdir

    stage = Stage(
        repo=repo,
        wdir=wdir,
        cmd=kwargs.get("cmd", None),
        locked=kwargs.get("locked", False),
        always_changed=kwargs.get("always_changed", False),
    )

    Stage._fill_stage_outputs(stage, **kwargs)
    stage.deps = dependency.loads_from(stage,
                                       kwargs.get("deps", []),
                                       erepo=kwargs.get("erepo", None))

    stage._check_circular_dependency()
    stage._check_duplicated_arguments()

    # Derive a file name from the outputs when none was given.
    if not fname:
        fname = Stage._stage_fname(stage.outs, add)
    stage._check_dvc_filename(fname)

    # Autodetecting wdir for add, we need to create outs first to do that,
    # so we start with wdir = . and remap out paths later.
    if add and kwargs.get("wdir") is None and cwd is None:
        wdir = os.path.dirname(fname)

        for out in chain(stage.outs, stage.deps):
            if out.is_in_repo:
                out.def_path = relpath(out.path_info, wdir)

    wdir = os.path.abspath(wdir)

    # With the deprecated `cwd`, the stage file lives inside wdir;
    # otherwise fname is resolved against the current directory.
    if cwd is not None:
        path = os.path.join(wdir, fname)
    else:
        path = os.path.abspath(fname)

    Stage._check_stage_path(repo, wdir)
    Stage._check_stage_path(repo, os.path.dirname(path))

    stage.wdir = wdir
    stage.path = path

    ignore_build_cache = kwargs.get("ignore_build_cache", False)

    # NOTE: remove outs before we check build cache
    if kwargs.get("remove_outs", False):
        logger.warning("--remove-outs is deprecated."
                       " It is now the default behavior,"
                       " so there's no need to use this option anymore.")
        stage.remove_outs(ignore_remove=False)
        logger.warning("Build cache is ignored when using --remove-outs.")
        ignore_build_cache = True

    if os.path.exists(path) and any(out.persist for out in stage.outs):
        logger.warning("Build cache is ignored when persisting outputs.")
        ignore_build_cache = True

    if os.path.exists(path):
        # An existing, cached stage short-circuits: nothing to run.
        if (not ignore_build_cache and stage.is_cached
                and not stage.is_callback and not stage.always_changed):
            logger.info("Stage is cached, skipping.")
            return None

        msg = ("'{}' already exists. Do you wish to run the command and "
               "overwrite it?".format(stage.relpath))

        # Only prompt when overwriting was not allowed up front.
        if not kwargs.get("overwrite", True) and not prompt.confirm(msg):
            raise StageFileAlreadyExistsError(stage.relpath)

        os.unlink(path)

    return stage
def dvc_dir(self):
    """Return the repo's ``dvc_dir`` relative to the SCM root directory."""
    repo_dvc_dir = self.repo.dvc_dir
    scm_root = self.repo.scm.root_dir
    return relpath(repo_dvc_dir, scm_root)
def _check_dvc_filename(fname):
    """Raise StageFileBadNameError unless ``fname`` is a valid DVC-file name.

    Raises:
        StageFileBadNameError: when ``Stage.is_valid_filename(fname)`` is
            false.
    """
    if Stage.is_valid_filename(fname):
        return

    template = (
        "bad DVC-file name '{}'. DVC-files should be named "
        "'Dvcfile' or have a '.dvc' suffix (e.g. '{}.dvc')."
    )
    raise StageFileBadNameError(
        template.format(relpath(fname), os.path.basename(fname))
    )
def path_in_repo(self):
    """Return ``self.path`` relative to the repo's root directory."""
    own_path = self.path
    repo_root = self.repo.root_dir
    return relpath(own_path, repo_root)
def test_str_workdir_outside_repo(tmp_dir, erepo_dir):
    """str() of an output equals its path relative to the external repo root."""
    stage = Stage(erepo_dir.dvc)
    output = LocalOutput(stage, "path", cache=False)

    expected = relpath("path", erepo_dir.dvc.root_dir)
    assert expected == str(output)
def graph(self, stages=None, from_directory=None):
    """Generate a graph by using the given stages on the given directory

    The nodes of the graph are the stage's path relative to the root.

    Edges are created when the output of one stage is used as a
    dependency in other stage.

    The direction of the edges goes from the stage to its dependency:

    For example, running the following:

        $ dvc run -o A "echo A > A"
        $ dvc run -d A -o B "echo B > B"
        $ dvc run -d B -o C "echo C > C"

    Will create the following graph:

           ancestors <--
                       |
            C.dvc -> B.dvc -> A.dvc
            |          |
            |          --> descendants
            |
            ------- pipeline ------>
                       |
                       v
          (weakly connected components)

    Args:
        stages (list): used to build a graph, if None given, use the ones
            on the `from_directory`.

        from_directory (str): directory where to look at for stages, if
            None is given, use the current working directory

    Returns:
        A ``(G, G_active)`` tuple of directed graphs. Both contain a node
        for every stage; ``G`` has every dependency edge, while
        ``G_active`` only has the edges that start at a stage whose
        ``locked`` attribute is false.

    Raises:
        OutputDuplicationError: two outputs with the same path
        StagePathAsOutputError: stage inside an output directory
        OverlappingOutputPathsError: output inside output directory
        CyclicGraphError: resulting graph has cycles
    """
    import networkx as nx
    from dvc.exceptions import (
        OutputDuplicationError,
        StagePathAsOutputError,
        OverlappingOutputPathsError,
    )

    G = nx.DiGraph()
    G_active = nx.DiGraph()
    stages = stages or self.stages(from_directory, check_dag=False)
    # Drop falsy entries before walking the list.
    stages = [stage for stage in stages if stage]
    outs = []

    # Pass 1: collect every output, rejecting duplicated and nested paths.
    for stage in stages:
        for out in stage.outs:
            existing = []
            for o in outs:
                if o.path_info == out.path_info:
                    existing.append(o.stage)

                in_o_dir = out.path_info.isin(o.path_info)
                in_out_dir = o.path_info.isin(out.path_info)
                if in_o_dir or in_out_dir:
                    raise OverlappingOutputPathsError(o, out)

            if existing:
                # Rebinding ``stages`` is harmless: the raise follows
                # immediately.
                stages = [stage.relpath, existing[0].relpath]
                raise OutputDuplicationError(str(out), stages)

            outs.append(out)

    # Pass 2: a stage file must not live inside any output path.
    for stage in stages:
        stage_path_info = PathInfo(stage.path)
        for out in outs:
            if stage_path_info.isin(out.path_info):
                raise StagePathAsOutputError(stage.wdir, stage.relpath)

    # Pass 3: add nodes and stage -> dependency edges.
    for stage in stages:
        node = relpath(stage.path, self.root_dir)

        G.add_node(node, stage=stage)
        G_active.add_node(node, stage=stage)

        for dep in stage.deps:
            for out in outs:
                # Skip outputs unrelated to this dependency: neither the
                # same path nor one nested inside the other.
                if (out.path_info != dep.path_info
                        and not dep.path_info.isin(out.path_info)
                        and not out.path_info.isin(dep.path_info)):
                    continue

                dep_stage = out.stage
                dep_node = relpath(dep_stage.path, self.root_dir)
                G.add_node(dep_node, stage=dep_stage)
                G.add_edge(node, dep_node)
                if not stage.locked:
                    G_active.add_node(dep_node, stage=dep_stage)
                    G_active.add_edge(node, dep_node)

    self._check_cyclic_graph(G)

    return G, G_active
def init(root_dir=os.curdir, no_scm=False, force=False, subdir=False):
    """
    Creates an empty repo on the given directory -- basically a
    `.dvc` directory with subdirectories for configuration and cache.

    It should be tracked by a SCM or use the `--no-scm` flag.

    If the given directory is not empty, you must use the `--force`
    flag to override it.

    Args:
        root_dir: Path to repo's root directory.
        no_scm: Passed to the SCM constructor; when true, ``core.no_scm``
            is also written to the new config.
        force: Remove an already existing DVC directory instead of failing.
        subdir: Passed to the SCM constructor as
            ``search_parent_directories``.

    Returns:
        Repo instance.

    Raises:
        InvalidArgumentError: both ``no_scm`` and ``subdir`` are set.
        InitError: the SCM constructor raised SCMError, or the DVC
            directory already exists and ``force`` is false.
    """
    if no_scm and subdir:
        raise InvalidArgumentError(
            "Cannot initialize repo with `--no-scm` and `--subdir`")

    root_dir = os.path.realpath(root_dir)
    dvc_dir = os.path.join(root_dir, Repo.DVC_DIR)

    try:
        scm = SCM(root_dir, search_parent_directories=subdir, no_scm=no_scm)
    except SCMError:
        raise InitError(
            "{repo} is not tracked by any supported SCM tool (e.g. Git). "
            "Use `--no-scm` if you don't want to use any SCM or "
            "`--subdir` if initializing inside a subdirectory of a parent SCM "
            "repository.".format(repo=root_dir))

    if os.path.isdir(dvc_dir):
        if not force:
            raise InitError("'{repo}' exists. Use `-f` to force.".format(
                repo=relpath(dvc_dir)))

        # Forced: remove the existing directory before recreating it.
        remove(dvc_dir)

    os.mkdir(dvc_dir)

    config = Config.init(dvc_dir)

    if no_scm:
        with config.edit() as conf:
            conf["core"]["no_scm"] = True

    dvcignore = init_dvcignore(root_dir)

    proj = Repo(root_dir)

    # Hand the freshly created files to the SCM.
    scm.add(
        [config.files["repo"], dvcignore, proj.plots.templates.templates_dir])

    if scm.ignore_file:
        scm.add([os.path.join(dvc_dir, scm.ignore_file)])
        logger.info("\nYou can now commit the changes to git.\n")

    _welcome_message()

    return proj