def init(root_dir=os.curdir, no_scm=False, force=False, subdir=False):
    """Create an empty DVC repo in the given directory.

    Creates a `.dvc` directory with subdirectories for configuration and
    cache. The repo should be tracked by an SCM, or use the `--no-scm`
    flag. If a `.dvc` directory already exists, `--force` must be used
    to override it.

    Args:
        root_dir: Path to repo's root directory.
        no_scm: Initialize without any SCM tracking.
        force: Overwrite an existing `.dvc` directory.
        subdir: Initialize inside a subdirectory of a parent SCM repo.

    Returns:
        Repo instance.

    Raises:
        InvalidArgumentError: `no_scm` and `subdir` were both given.
        InitError: the directory is not tracked by a supported SCM (and
            `no_scm` is not set), or `.dvc` already exists and `force`
            is not set.
    """
    if no_scm and subdir:
        raise InvalidArgumentError(
            "Cannot initialize repo with `--no-scm` and `--subdir`"
        )

    root_dir = os.path.realpath(root_dir)
    dvc_dir = os.path.join(root_dir, Repo.DVC_DIR)

    try:
        scm = SCM(root_dir, search_parent_directories=subdir, no_scm=no_scm)
    except SCMError:
        raise InitError(
            "{repo} is not tracked by any supported SCM tool (e.g. Git). "
            "Use `--no-scm` if you don't want to use any SCM or "
            "`--subdir` if initializing inside a subdirectory of a parent SCM "
            "repository.".format(repo=root_dir)
        )

    if os.path.isdir(dvc_dir):
        if not force:
            raise InitError(
                "'{repo}' exists. Use `-f` to force.".format(
                    repo=relpath(dvc_dir)
                )
            )
        # --force: wipe the stale .dvc directory before re-creating it.
        remove(dvc_dir)

    os.mkdir(dvc_dir)

    config = Config.init(dvc_dir)

    if no_scm:
        with config.edit() as conf:
            conf["core"]["no_scm"] = True

    dvcignore = init_dvcignore(root_dir)

    proj = Repo(root_dir)

    proj.plots.templates.init()

    # Stage the freshly created config, .dvcignore and plot templates so
    # the user can commit them in one go.
    scm.add(
        [config.files["repo"], dvcignore, proj.plots.templates.templates_dir]
    )

    if scm.ignore_file:
        scm.add([os.path.join(dvc_dir, scm.ignore_file)])

    logger.info("\nYou can now commit the changes to git.\n")

    return proj
def reproduce(
    cls,
    dvc_dir: Optional[str],
    rev: str,
    queue: Optional["Queue"] = None,
    rel_cwd: Optional[str] = None,
    name: Optional[str] = None,
    log_level: Optional[int] = None,
) -> "ExecutorResult":
    """Run dvc repro and return the result.

    Returns tuple of (exp_hash, exp_ref, force) where exp_hash is the
    experiment hash (or None on error), exp_ref is the experiment ref,
    and force is a bool specifying whether or not this experiment
    should force overwrite any existing duplicates.
    """
    from dvc.repo import Repo
    from dvc.repo.checkout import checkout as dvc_checkout
    from dvc.repo.reproduce import reproduce as dvc_reproduce

    unchanged = []

    if queue is not None:
        # Let the parent process know which rev this worker handles.
        queue.put((rev, os.getpid()))
    if log_level is not None:
        cls._set_log_level(log_level)

    def filter_pipeline(stages):
        # Collect pipeline stages that repro reports as unchanged.
        unchanged.extend(
            [stage for stage in stages if isinstance(stage, PipelineStage)]
        )

    exp_hash: Optional[str] = None
    exp_ref: Optional["ExpRefInfo"] = None
    repro_force: bool = False

    # BUG FIX: pre-bind so the `finally` cleanup cannot raise NameError
    # when Repo() (or anything before these assignments) fails.
    dvc = None
    old_cwd = None

    try:
        dvc = Repo(dvc_dir)
        if dvc_dir is not None:
            old_cwd = os.getcwd()
            if rel_cwd:
                os.chdir(os.path.join(dvc.root_dir, rel_cwd))
            else:
                os.chdir(dvc.root_dir)
        logger.debug("Running repro in '%s'", os.getcwd())

        # Packed repro args are written by the parent before spawning us;
        # consume and delete them if present.
        args_path = os.path.join(dvc.tmp_dir, BaseExecutor.PACKED_ARGS_FILE)
        if os.path.exists(args_path):
            args, kwargs = BaseExecutor.unpack_repro_args(args_path)
            remove(args_path)
        else:
            args = []
            kwargs = {}

        repro_force = kwargs.get("force", False)
        logger.debug("force = %s", str(repro_force))

        # NOTE: for checkpoint experiments we handle persist outs slightly
        # differently than normal:
        #
        # - checkpoint out may not yet exist if this is the first time this
        #   experiment has been run, this is not an error condition for
        #   experiments
        # - at the start of a repro run, we need to remove the persist out
        #   and restore it to its last known (committed) state (which may
        #   be removed/does not yet exist) so that our executor workspace
        #   is not polluted with the (persistent) out from an unrelated
        #   experiment run
        dvc_checkout(dvc, force=True, quiet=True)

        checkpoint_func = partial(
            cls.checkpoint_callback, dvc.scm, name, repro_force
        )
        stages = dvc_reproduce(
            dvc,
            *args,
            on_unchanged=filter_pipeline,
            checkpoint_func=checkpoint_func,
            **kwargs,
        )

        exp_hash = cls.hash_exp(stages)
        try:
            cls.commit(
                dvc.scm,
                exp_hash,
                exp_name=name,
                force=repro_force,
                checkpoint=any(stage.is_checkpoint for stage in stages),
            )
        except UnchangedExperimentError:
            # Nothing new to commit is not an error for experiments.
            pass
        ref = dvc.scm.get_ref(EXEC_BRANCH, follow=False)
        if ref:
            exp_ref = ExpRefInfo.from_ref(ref)
        if cls.WARN_UNTRACKED:
            untracked = dvc.scm.untracked_files()
            if untracked:
                logger.warning(
                    "The following untracked files were present in the "
                    "experiment directory after reproduction but will "
                    "not be included in experiment commits:\n"
                    "\t%s",
                    ", ".join(untracked),
                )
    finally:
        if dvc:
            dvc.scm.close()
        if old_cwd:
            os.chdir(old_cwd)

    # ideally we would return stages here like a normal repro() call, but
    # stages is not currently picklable and cannot be returned across
    # multiprocessing calls
    return ExecutorResult(exp_hash, exp_ref, repro_force)
def dvc(self):
    """Instantiate and return the Repo rooted at ``self.dvc_dir``."""
    from dvc.repo import Repo

    repo_root = self.dvc_dir
    return Repo(repo_root)
def reproduce(
    cls,
    dvc_dir: str,
    queue: "Queue",
    rev: str,
    cwd: Optional[str] = None,
    name: Optional[str] = None,
) -> Tuple[bool, Optional[str]]:
    """Run dvc repro and return the result.

    Returns tuple of (exp_hash, force) where exp_hash is the experiment
    hash (or None on error) and force is a bool specifying whether or
    not this experiment should force overwrite any existing duplicates.
    """
    unchanged = []

    # Let the parent process know which rev this worker handles.
    queue.put((rev, os.getpid()))

    def filter_pipeline(stages):
        # Collect pipeline stages that repro reports as unchanged.
        unchanged.extend(
            [stage for stage in stages if isinstance(stage, PipelineStage)]
        )

    result = None
    force = False

    # BUG FIX: pre-bind so the `finally` cleanup cannot raise NameError
    # when Repo() (or anything before these assignments) fails.
    dvc = None
    old_cwd = None

    try:
        dvc = Repo(dvc_dir)
        old_cwd = os.getcwd()
        new_cwd = cwd if cwd else dvc.root_dir
        os.chdir(new_cwd)
        logger.debug("Running repro in '%s'", cwd)

        # Packed repro args are written by the parent before spawning us;
        # consume and delete them if present.
        args_path = os.path.join(dvc.tmp_dir, BaseExecutor.PACKED_ARGS_FILE)
        if os.path.exists(args_path):
            args, kwargs = BaseExecutor.unpack_repro_args(args_path)
            remove(args_path)
        else:
            args = []
            kwargs = {}

        force = kwargs.get("force", False)

        # NOTE: for checkpoint experiments we handle persist outs slightly
        # differently than normal:
        #
        # - checkpoint out may not yet exist if this is the first time this
        #   experiment has been run, this is not an error condition for
        #   experiments
        # - at the start of a repro run, we need to remove the persist out
        #   and restore it to its last known (committed) state (which may
        #   be removed/does not yet exist) so that our executor workspace
        #   is not polluted with the (persistent) out from an unrelated
        #   experiment run
        dvc.checkout(force=True, quiet=True)

        checkpoint_func = partial(cls.checkpoint_callback, dvc.scm, name)
        stages = dvc.reproduce(
            *args,
            on_unchanged=filter_pipeline,
            checkpoint_func=checkpoint_func,
            **kwargs,
        )

        exp_hash = cls.hash_exp(stages)
        exp_rev = cls.commit(dvc.scm, exp_hash, exp_name=name)
        # BUG FIX: `result` was never assigned, so the documented
        # exp_hash was always returned as None.
        result = exp_hash
        if dvc.scm.get_ref(EXEC_CHECKPOINT):
            dvc.scm.set_ref(EXEC_CHECKPOINT, exp_rev)
    except UnchangedExperimentError:
        # Nothing new to commit is not an error for experiments.
        pass
    finally:
        if dvc:
            dvc.scm.close()
        if old_cwd:
            os.chdir(old_cwd)

    # ideally we would return stages here like a normal repro() call, but
    # stages is not currently picklable and cannot be returned across
    # multiprocessing calls
    return result, force
def _read_outputs(self, module_temp_dir):
    """Collect every output object from every stage of the repo located
    at *module_temp_dir*."""
    from dvc.repo import Repo

    repo = Repo(module_temp_dir)
    outputs = []
    for stage in repo.stages():
        outputs.extend(stage.outs)
    return outputs
def _get_repo():
    """Build and return a Repo rooted at the current working directory."""
    from dvc.repo import Repo

    repo = Repo()
    return repo
def gc(
    self,
    all_branches=False,
    cloud=False,
    remote=None,
    with_deps=False,
    all_tags=False,
    all_commits=False,
    all_experiments=False,
    force=False,
    jobs=None,
    repos=None,
    workspace=False,
):
    """Remove unused cache entries, per object-storage scheme.

    Collects the cache entries still referenced by this repo (and any
    extra `repos`) across the requested revision scopes, then deletes
    everything else from each local cache backend.  With `cloud=True`
    the same sweep is applied to the resolved remote as well.
    """
    # require `workspace` to be true to come into effect.
    # assume `workspace` to be enabled if any of `all_tags`, `all_commits`,
    # `all_experiments` or `all_branches` are enabled.
    _raise_error_if_all_disabled(
        workspace=workspace,
        all_tags=all_tags,
        all_commits=all_commits,
        all_branches=all_branches,
        all_experiments=all_experiments,
    )

    from contextlib import ExitStack

    from dvc.repo import Repo

    if not repos:
        repos = []
    all_repos = [Repo(path) for path in repos]

    with ExitStack() as stack:
        # Hold every extra repo's lock and state while computing usage so
        # the used-cache sets cannot change underneath us.
        for repo in all_repos:
            stack.enter_context(repo.lock)
            stack.enter_context(repo.state)

        used = NamedCache()
        # Merge used-cache info from all extra repos plus this one.
        for repo in all_repos + [self]:
            used.update(
                repo.used_cache(
                    all_branches=all_branches,
                    with_deps=with_deps,
                    all_tags=all_tags,
                    all_commits=all_commits,
                    all_experiments=all_experiments,
                    remote=remote,
                    force=force,
                    jobs=jobs,
                )
            )

    # Sweep each configured object store, keeping only the used keys.
    for scheme, odb in self.odb.by_scheme():
        if not odb:
            continue
        removed = odb.gc(set(used.scheme_keys(scheme)), jobs=jobs)
        if not removed:
            logger.info(f"No unused '{scheme}' cache to remove.")

    if not cloud:
        return

    # NOTE: rebinds `remote` from the remote *name* to the resolved
    # remote object before sweeping it.
    remote = self.cloud.get_remote(remote, "gc -c")
    removed = remote.gc(set(used.scheme_keys(Schemes.LOCAL)), jobs=jobs)
    if not removed:
        logger.info("No unused cache to remove from remote.")
def test_show_no_repo(tmp_dir):
    """`metrics show` must work against an uninitialized (no .dvc) dir."""
    metrics_content = '{"foo": 0, "bar": 0.0, "baz": {}}'
    tmp_dir.gen("metrics.json", metrics_content)

    dvc = Repo(uninitialized=True)

    dvc.metrics.show(targets=["metrics.json"])
def repo():
    """Fixture: a Repo rooted at the current directory."""
    current_dir = "."
    return Repo(current_dir)
def stages():
    """Return the relative paths of every stage in the tmp_dir repo."""
    repo = Repo(os.fspath(tmp_dir))
    relpaths = set()
    for stage in repo.stages:
        relpaths.add(stage.relpath)
    return relpaths
def external_repo(
    url, rev=None, for_write=False, cache_dir=None, cache_types=None, **kwargs
):
    """Context manager yielding a Repo for an external (cloned) repository.

    Clones (or reuses a cached clone of) `url`, wires up the cache
    configuration, and yields an initialized Repo.  The clone directory
    is removed on exit when `for_write` is set.  Low-level errors are
    translated into their external-repo equivalents.
    """
    from scmrepo.git import Git

    from dvc.config import NoRemoteError
    from dvc.fs.git import GitFileSystem

    logger.debug("Creating external repo %s@%s", url, rev)
    path = _cached_clone(url, rev, for_write=for_write)
    # Local HEAD points to the tip of whatever branch we first cloned from
    # (which may not be the default branch), use origin/HEAD here to get
    # the tip of the default branch
    rev = rev or "refs/remotes/origin/HEAD"

    cache_config = {
        "cache": {"dir": cache_dir or _get_cache_dir(url), "type": cache_types}
    }

    # For a local source URL, inherit its remote config too.
    config = _get_remote_config(url) if os.path.isdir(url) else {}
    config.update(cache_config)

    if for_write:
        # Writable repos operate directly on the clone's working tree.
        root_dir = path
        fs = None
    else:
        # Read-only repos are accessed through a GitFileSystem pinned to
        # the requested rev.
        root_dir = os.path.realpath(path)
        scm = Git(root_dir)
        fs = GitFileSystem(scm=scm, rev=rev)

    repo_kwargs = dict(
        root_dir=root_dir,
        url=url,
        fs=fs,
        config=config,
        repo_factory=erepo_factory(url, cache_config),
        **kwargs,
    )

    # Defaults that callers may override via **kwargs.
    if "subrepos" not in repo_kwargs:
        repo_kwargs["subrepos"] = True

    if "uninitialized" not in repo_kwargs:
        repo_kwargs["uninitialized"] = True

    repo = Repo(**repo_kwargs)

    try:
        yield repo
    except NoRemoteError as exc:
        raise NoRemoteInExternalRepoError(url) from exc
    except OutputNotFoundError as exc:
        # Only translate errors raised by *this* repo; re-raise others.
        if exc.repo is repo:
            raise NoOutputInExternalRepoError(
                exc.output, repo.root_dir, url
            ) from exc
        raise
    except FileMissingError as exc:
        raise PathMissingError(exc.path, url) from exc
    finally:
        repo.close()
        if for_write:
            _remove(path)
def stages():
    """Return the relative paths of all stages in the repo at *tmp_dir*."""
    # Set comprehension instead of set(<generator>) — same result, one
    # less call and clearer intent (ruff C401).
    return {stage.relpath for stage in Repo(fspath(tmp_dir)).stages}
""" Define paths to all DVC dependency and output files. """ from pathlib import Path from urllib.parse import urlparse import dask from dvc.repo import Repo dvc_repo = Repo('.') # Get DVC remote URL to your remote work directory for this project remote_user_work_path = urlparse( dvc_repo.config.config[f'remote "ahsoka_user_workspace"']['url']).path remote_user_work_path = Path(remote_user_work_path) remote_project_work_path = urlparse( dvc_repo.config.config[f'remote "ahsoka_project_data"']['url']).path remote_project_work_folder_name = Path(remote_project_work_path).name remote_work_path = remote_user_work_path/remote_project_work_folder_name # Get DVC remote URL to the project cache remote_cache_path = urlparse( dvc_repo.config.config[f'remote "ahsoka_project_cache"']['url']).path remote_cache_path = Path(remote_cache_path) assert remote_work_path.name == remote_cache_path.name, ( 'The name of your remote DVC data directory for this project:' f'"{remote_work_path.name}", is should be the same as the same as the name' f'of the project cache directory: "{remote_cache_path.name}') data_dir = remote_work_path
def gc(
    self,
    all_branches=False,
    cloud=False,
    remote=None,
    with_deps=False,
    all_tags=False,
    all_commits=False,
    all_experiments=False,
    force=False,
    jobs=None,
    repos=None,
    workspace=False,
):
    """Remove unused objects from the local object stores and, with
    `cloud=True`, from the configured remote.

    Collects the object IDs still used by this repo (and any extra
    `repos`) across the requested revision scopes, then sweeps each
    local ODB; the remote's index is cleared if anything was removed
    from it.
    """
    # require `workspace` to be true to come into effect.
    # assume `workspace` to be enabled if any of `all_tags`, `all_commits`,
    # `all_experiments` or `all_branches` are enabled.
    _raise_error_if_all_disabled(
        workspace=workspace,
        all_tags=all_tags,
        all_commits=all_commits,
        all_branches=all_branches,
        all_experiments=all_experiments,
    )

    from contextlib import ExitStack

    from dvc.objects.db import get_index
    from dvc.repo import Repo

    if not repos:
        repos = []
    all_repos = [Repo(path) for path in repos]

    used_obj_ids = set()
    with ExitStack() as stack:
        # Hold every extra repo's lock while computing usage so the
        # used-object sets cannot change underneath us.
        for repo in all_repos:
            stack.enter_context(repo.lock)

        # Union the used object IDs across all repos and all requested
        # revision scopes.
        for repo in all_repos + [self]:
            for obj_ids in repo.used_objs(
                all_branches=all_branches,
                with_deps=with_deps,
                all_tags=all_tags,
                all_commits=all_commits,
                all_experiments=all_experiments,
                remote=remote,
                force=force,
                jobs=jobs,
            ).values():
                used_obj_ids.update(obj_ids)

    # Sweep each configured local object store.
    for scheme, odb in self.odb.by_scheme():
        if not odb:
            continue
        removed = odb.gc(used_obj_ids, jobs=jobs)
        if not removed:
            logger.info(f"No unused '{scheme}' cache to remove.")

    if not cloud:
        return
    odb = self.cloud.get_remote_odb(remote, "gc -c")
    removed = odb.gc(used_obj_ids)
    if removed:
        # The remote index is now stale — drop it so it gets rebuilt.
        get_index(odb).clear()
    else:
        logger.info("No unused cache to remove from remote.")
def gc(
    self,
    all_branches=False,
    cloud=False,
    remote=None,
    with_deps=False,
    all_tags=False,
    all_commits=False,
    force=False,
    jobs=None,
    repos=None,
    workspace=False,
):
    """Garbage-collect unused cache entries for every configured cache
    backend (local, s3, gs, ssh, hdfs, azure) and, with `cloud=True`,
    for the resolved remote.
    """
    # require `workspace` to be true to come into effect.
    # assume `workspace` to be enabled if any of `all_tags`, `all_commits`,
    # or `all_branches` are enabled.
    _raise_error_if_all_disabled(
        workspace=workspace,
        all_tags=all_tags,
        all_commits=all_commits,
        all_branches=all_branches,
        cloud=cloud,
    )

    from contextlib import ExitStack

    from dvc.repo import Repo

    all_repos = []

    if repos:
        all_repos = [Repo(path) for path in repos]

    with ExitStack() as stack:
        # Hold every extra repo's lock and state while computing usage
        # so the used-cache sets cannot change underneath us.
        for repo in all_repos:
            stack.enter_context(repo.lock)
            stack.enter_context(repo.state)

        used = NamedCache()
        # Merge used-cache info from all extra repos plus this one.
        for repo in all_repos + [self]:
            used.update(
                repo.used_cache(
                    all_branches=all_branches,
                    with_deps=with_deps,
                    all_tags=all_tags,
                    all_commits=all_commits,
                    remote=remote,
                    force=force,
                    jobs=jobs,
                )
            )

    # Sweep each cache backend that is configured; `local` always exists.
    _do_gc("local", self.cache.local.gc, used)

    if self.cache.s3:
        _do_gc("s3", self.cache.s3.gc, used)

    if self.cache.gs:
        _do_gc("gs", self.cache.gs.gc, used)

    if self.cache.ssh:
        _do_gc("ssh", self.cache.ssh.gc, used)

    if self.cache.hdfs:
        _do_gc("hdfs", self.cache.hdfs.gc, used)

    if self.cache.azure:
        _do_gc("azure", self.cache.azure.gc, used)

    if cloud:
        _do_gc("remote", self.cloud.get_remote(remote, "gc -c").gc, used)
def exp_dvc(self):
    """Return clone dvc Repo instance."""
    from dvc.repo import Repo

    clone_dir = self.exp_dvc_dir
    return Repo(clone_dir)
def collect_stages():
    """Gather the relative paths of every stage in the tmp_dir repo's index."""
    repo = Repo(os.fspath(tmp_dir))
    relpaths = set()
    for stage in repo.index.stages:
        relpaths.add(stage.relpath)
    return relpaths
def external_repo(
    url, rev=None, for_write=False, cache_dir=None, cache_types=None, **kwargs
):
    """Context manager yielding a Repo for an external (cloned) repository.

    Clones (or reuses a cached clone of) `url`, wires up a shared cache
    configuration, and yields an initialized Repo.  The clone directory
    is removed on exit when `for_write` is set.  Low-level errors are
    translated into their external-repo equivalents.
    """
    from dvc.config import NoRemoteError
    from dvc.scm.git import Git

    logger.debug("Creating external repo %s@%s", url, rev)
    path = _cached_clone(url, rev, for_write=for_write)
    # Local HEAD points to the tip of whatever branch we first cloned from
    # (which may not be the default branch), use origin/HEAD here to get
    # the tip of the default branch
    rev = rev or "refs/remotes/origin/HEAD"

    cache_config = {
        "cache": {
            "dir": cache_dir or _get_cache_dir(url),
            "type": cache_types,
        }
    }

    # For a local source URL, inherit its remote config too.
    config = _get_remote_config(url) if os.path.isdir(url) else {}
    config.update(cache_config)

    def make_repo(path, **_kwargs):
        # Factory for subrepos: reuse the shared cache config and, for
        # local source URLs, layer in the matching remote config for the
        # subrepo's path inside the source.
        _config = cache_config.copy()
        if os.path.isdir(url):
            rel = os.path.relpath(path, _kwargs["scm"].root_dir)
            repo_path = os.path.join(url, rel)
            _config.update(_get_remote_config(repo_path))
        return Repo(path, config=_config, **_kwargs)

    # Writable repos operate on the clone's working tree directly;
    # read-only ones are pinned to `rev` via an scm instance.
    root_dir = path if for_write else os.path.realpath(path)
    repo_kwargs = dict(
        root_dir=root_dir,
        url=url,
        scm=None if for_write else Git(root_dir),
        rev=None if for_write else rev,
        subrepos=not for_write,
        uninitialized=True,
        config=config,
        repo_factory=make_repo,
        **kwargs,
    )

    # Default that callers may override via **kwargs.
    if "fetch" not in repo_kwargs:
        repo_kwargs["fetch"] = True

    repo = Repo(**repo_kwargs)

    try:
        yield repo
    except NoRemoteError as exc:
        raise NoRemoteInExternalRepoError(url) from exc
    except OutputNotFoundError as exc:
        # Only translate errors raised by *this* repo; re-raise others.
        if exc.repo is repo:
            raise NoOutputInExternalRepoError(
                exc.output, repo.root_dir, url
            ) from exc
        raise
    except FileMissingError as exc:
        raise PathMissingError(exc.path, url) from exc
    finally:
        repo.close()
        if for_write:
            _remove(path)
def __init__(self, args):
    """Bind the parsed CLI args and open the repo this command runs in."""
    from dvc.repo import Repo

    self.args = args
    repo = Repo()
    self.repo = repo
    self.config = repo.config
def identifier(self) -> str:
    """Unique identifier for the index.

    We can use this to optimize and skip opening some indices
    eg: on push/pull/fetch/gc --all-commits.

    Currently, it is unique to the platform (windows vs posix).
    """
    # Hash of the serialized index contents; dumpd() output differs
    # between windows and posix, hence the platform caveat above.
    return dict_md5(self.dumpd())


if __name__ == "__main__":
    # Ad-hoc benchmark harness: times stage collection, graph building,
    # hashing and updating for the repo in the current directory.
    from funcy import log_durations

    from dvc.repo import Repo

    repo = Repo()
    index = Index(repo, repo.fs)
    print(index)
    with log_durations(print, "collecting stages"):
        # pylint: disable=pointless-statement
        print("no of stages", len(index.stages))
    with log_durations(print, "building graph"):
        index.build_graph()
    with log_durations(print, "calculating hash"):
        print(index.identifier)
    with log_durations(print, "updating"):
        index2 = index.update(index.stages)
    with log_durations(print, "calculating hash"):
        print(index2.identifier)
def create_dag(data_path: str, out: str) -> None:
    """Register a training stage that produces *out* from *data_path*."""
    repo = Repo()
    command = f"python train_model.py {data_path} {out}"
    stage_file = out + ".dvc"
    repo.run(
        cmd=command,
        deps=[data_path],
        outs=[out],
        fname=stage_file,
    )