def test_show_running_workspace(tmp_dir, scm, dvc, exp_stage, capsys):
    """A workspace pidfile must make `exp show` report the exp as running."""
    exec_pid_dir = os.path.join(dvc.tmp_dir, dvc.experiments.EXEC_PID_DIR)
    makedirs(exec_pid_dir, True)

    exec_info = ExecutorInfo(None, None, None, BaseExecutor.DEFAULT_LOCATION)
    pidfile = os.path.join(exec_pid_dir, f"workspace{BaseExecutor.PIDFILE_EXT}")
    (tmp_dir / pidfile).dump(exec_info.to_dict())

    expected_workspace = {
        "baseline": {
            "data": {
                "metrics": {"metrics.yaml": {"data": {"foo": 1}}},
                "params": {"params.yaml": {"data": {"foo": 1}}},
                "queued": False,
                "running": True,
                "executor": exec_info.location,
                "timestamp": None,
            }
        }
    }
    assert dvc.experiments.show()["workspace"] == expected_workspace

    capsys.readouterr()  # discard anything captured so far
    assert main(["exp", "show", "--no-pager"]) == 0
    cap = capsys.readouterr()
    assert "Running" in cap.out
    assert exec_info.location in cap.out
def load(self, retries: int = 1):
    """(Re)load this index database.

    Opens the SQLite file at ``self.path`` (creating parent directories
    as needed) and prepares the schema.  If the on-disk file turns out
    to be corrupted (``sqlite3.DatabaseError``), it is deleted and
    recreated up to ``retries`` times before the error is propagated.

    Args:
        retries: how many times a corrupted database file may be removed
            and recreated before giving up.  The default of 1 preserves
            the previously hard-coded behavior.

    Raises:
        sqlite3.DatabaseError: if the database cannot be prepared even
            after exhausting all retries.
    """
    while True:
        # load() must not be called while a connection is already open.
        assert self.database is None
        assert self.cursor is None

        empty = not os.path.isfile(self.path)
        makedirs(os.path.dirname(self.path), exist_ok=True)
        self.database = _connect_sqlite(self.path, {"nolock": 1})
        self.cursor = self.database.cursor()
        try:
            self._prepare_db(empty=empty)
            return
        except sqlite3.DatabaseError:
            # Close and reset so the asserts above hold on retry.
            self.cursor.close()
            self.database.close()
            self.database = None
            self.cursor = None
            if retries <= 0:
                raise
            # Assume the file is corrupted and start from scratch.
            os.unlink(self.path)
            retries -= 1
def _get_repo_dirs(
    self,
    root_dir: str = None,
    scm: Base = None,
    rev: str = None,
    uninitialized: bool = False,
):
    """Resolve ``(root_dir, dvc_dir, tmp_dir)`` for this repository.

    When ``uninitialized`` is true and no DVC repo is found, fall back
    to the SCM root (or a no-SCM repo rooted at the cwd).
    """
    # scm and rev must be supplied together or not at all.
    assert bool(scm) == bool(rev)

    from dvc.scm import SCM
    from dvc.scm.git import Git
    from dvc.utils.fs import makedirs

    dvc_dir = None
    tmp_dir = None
    try:
        if isinstance(scm, Git) and rev:
            scm_tree = scm.get_tree(rev)
        else:
            scm_tree = None
        root_dir = self.find_root(root_dir, scm_tree)
        dvc_dir = os.path.join(root_dir, self.DVC_DIR)
        tmp_dir = os.path.join(dvc_dir, "tmp")
        makedirs(tmp_dir, exist_ok=True)
    except NotDvcRepoError:
        if not uninitialized:
            raise
        try:
            scm = SCM(root_dir or os.curdir)
        except (SCMError, InvalidGitRepositoryError):
            scm = SCM(os.curdir, no_scm=True)
        assert isinstance(scm, Base)
        root_dir = scm.root_dir
    return root_dir, dvc_dir, tmp_dir
def __init__(self, root_dir=None, scm=None, rev=None):
    """Initialize a Repo, either from the working tree or from a git rev."""
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.lock import make_lock
    from dvc.repo.metrics import Metrics
    from dvc.repo.params import Params
    from dvc.repo.plots import Plots
    from dvc.scm import SCM
    from dvc.scm.tree import WorkingTree
    from dvc.stage.cache import StageCache
    from dvc.state import State, StateNoop
    from dvc.utils.fs import makedirs

    if scm:
        # use GitTree instead of WorkingTree as default repo tree instance
        repo_tree = scm.get_tree(rev)
        self.root_dir = self.find_root(root_dir, repo_tree)
        self.scm = scm
        self.tree = repo_tree
        self.state = StateNoop()
    else:
        root_dir = self.find_root(root_dir)
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.tree = WorkingTree(self.root_dir)

    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
    self.config = Config(self.dvc_dir, tree=self.tree)

    if not scm:
        no_scm = self.config["core"].get("no_scm", False)
        self.scm = SCM(self.root_dir, no_scm=no_scm)

    self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
    self.index_dir = os.path.join(self.tmp_dir, "index")
    makedirs(self.index_dir, exist_ok=True)

    hardlink_lock = self.config["core"].get("hardlink_lock", False)
    self.lock = make_lock(
        os.path.join(self.tmp_dir, "lock"),
        tmp_dir=self.tmp_dir,
        hardlink_lock=hardlink_lock,
        friendly=True,
    )

    self.cache = Cache(self)
    self.cloud = DataCloud(self)

    if not scm:
        # NOTE: storing state and link_state in the repository itself to
        # avoid any possible state corruption in 'shared cache dir'
        # scenario.
        self.state = State(self.cache.local)

    self.stage_cache = StageCache(self)
    self.metrics = Metrics(self)
    self.plots = Plots(self)
    self.params = Params(self)

    self._ignore()
def _get_database_dir(self, db_name):
    """Return the directory that should hold ``db_name``'s SQLite files.

    NOTE: by default, store SQLite-based remote indexes and state's
    `links` and `md5s` caches in the repository itself to avoid any
    possible state corruption in 'shared cache dir' scenario, but allow
    user to override this through config when, say, the repository is
    located on a mounted volume — see
    https://github.com/iterative/dvc/issues/4420
    """
    base_db_dir = self.config.get(db_name, {}).get("dir", None)
    if not base_db_dir:
        return self.tmp_dir

    import hashlib

    from dvc.utils.fs import makedirs

    # Disambiguate same-named repos with a short hash of the full path.
    root_hash = hashlib.sha224(self.root_dir.encode("utf-8")).hexdigest()
    db_dir = os.path.join(
        base_db_dir,
        self.DVC_DIR,
        f"{os.path.basename(self.root_dir)}-{root_hash[:7]}",
    )
    makedirs(db_dir, exist_ok=True)
    return db_dir
def test_subrepos_are_ignored(tmp_dir, erepo_dir):
    """Files from a nested subrepo must not leak into the parent's outputs."""
    subrepo = erepo_dir / "dir" / "subrepo"
    make_subrepo(subrepo, erepo_dir.scm)
    with erepo_dir.chdir():
        erepo_dir.dvc_gen("dir/foo", "foo", commit="foo")
        erepo_dir.scm_gen("dir/bar", "bar", commit="bar")

    with subrepo.chdir():
        subrepo.dvc_gen({"file": "file"}, commit="add files on subrepo")

    with external_repo(os.fspath(erepo_dir)) as repo:
        repo.get_external("dir", "out")
        # "file" from the subrepo must be absent.
        expected_files = {"foo": "foo", "bar": "bar", ".gitignore": "/foo\n"}
        assert (tmp_dir / "out").read_text() == expected_files

        expected_hash = HashInfo("md5", "e1d9e8eae5374860ae025ec84cfd85c7.dir")
        actual_hash = repo.repo_tree.get_hash(
            os.path.join(repo.root_dir, "dir"), follow_subrepos=False
        )
        assert actual_hash == expected_hash

        # clear cache to test `fetch_external` again
        cache_dir = tmp_dir / repo.cache.local.cache_dir
        remove(cache_dir)
        makedirs(cache_dir)
        assert repo.fetch_external(["dir"]) == (
            len(expected_files),
            0,
            [expected_hash],
        )
def dump_json(self, filename: str):
    """Merge this object's dict representation into the JSON file at
    ``filename``, creating parent directories as needed."""
    from dvc.utils.fs import makedirs
    from dvc.utils.serialize import modify_json

    makedirs(os.path.dirname(filename), exist_ok=True)
    with modify_json(filename) as contents:
        contents.update(self.asdict())
def _init_executors(self, to_run):
    """Create a TempDirExecutor for each stashed experiment in ``to_run``.

    Returns a mapping of stash rev -> executor.
    """
    from dvc.utils.fs import makedirs

    from .executor.local import TempDirExecutor

    base_tmp_dir = os.path.join(self.repo.tmp_dir, self.EXEC_TMP_DIR)
    if not os.path.exists(base_tmp_dir):
        makedirs(base_tmp_dir)

    executors = {}
    for stash_rev, item in to_run.items():
        self.scm.set_ref(EXEC_HEAD, item.rev)
        self.scm.set_ref(EXEC_MERGE, stash_rev)
        self.scm.set_ref(EXEC_BASELINE, item.baseline_rev)

        # Executor will be initialized with an empty git repo that
        # we populate by pushing:
        #   EXEC_HEAD - the base commit for this experiment
        #   EXEC_MERGE - the unmerged changes (from our stash)
        #       to be reproduced
        #   EXEC_BASELINE - the baseline commit for this experiment
        executors[stash_rev] = TempDirExecutor(
            self.scm,
            self.dvc_dir,
            name=item.name,
            branch=item.branch,
            tmp_dir=base_tmp_dir,
            cache_dir=self.repo.odb.local.cache_dir,
        )

    # The temporary refs are only needed while executors are created.
    for ref in (EXEC_HEAD, EXEC_MERGE, EXEC_BASELINE):
        self.scm.remove_ref(ref)

    return executors
def _find_or_create_user_id():
    """Return the persistent anonymous user ID, creating it if missing.

    The user's ID is stored on a file under the global config directory.
    The file should contain a JSON with a "user_id" key:

        {"user_id": "16fd2706-8baf-433b-82eb-8c7fada847da"}

    IDs are generated randomly with UUID.  Returns None if the lock on
    the ID file cannot be acquired.
    """
    config_dir = Config.get_global_config_dir()
    fname = os.path.join(config_dir, "user_id")
    lockfile = os.path.join(config_dir, "user_id.lock")

    # Both `fname` and `lockfile` live under the global config dir, so
    # make sure that directory exists before touching either file.
    makedirs(config_dir, exist_ok=True)

    try:
        with Lock(lockfile):
            try:
                with open(fname, "r") as fobj:
                    user_id = json.load(fobj)["user_id"]
            except (FileNotFoundError, ValueError, KeyError):
                # Missing, unreadable or malformed file: mint a new ID.
                user_id = str(uuid.uuid4())
                with open(fname, "w") as fobj:
                    json.dump({"user_id": user_id}, fobj)

            return user_id
    except LockError:
        logger.debug(
            "Failed to acquire '{lockfile}'".format(lockfile=lockfile)
        )
def __init__(self, dvc_dir):
    """Seed the default plot templates the first time the dir is created."""
    self.dvc_dir = dvc_dir
    if not os.path.exists(self.templates_dir):
        makedirs(self.templates_dir, exist_ok=True)
        # Freshly created directory: write out every default template.
        for template_cls in self.TEMPLATES:
            self.dump(template_cls())
def _get_repo_dirs(
    self,
    root_dir: str = None,
    fs: "FileSystem" = None,
    uninitialized: bool = False,
):
    """Resolve ``(root_dir, dvc_dir, tmp_dir)`` for this repository.

    When ``uninitialized`` is true and no DVC repo is found, fall back
    to the SCM root (or a no-SCM repo rooted at the cwd).
    """
    from dvc.scm import SCM, Base, SCMError
    from dvc.utils.fs import makedirs

    dvc_dir = None
    tmp_dir = None
    try:
        root_dir = self.find_root(root_dir, fs)
        dvc_dir = os.path.join(root_dir, self.DVC_DIR)
        tmp_dir = os.path.join(dvc_dir, "tmp")
        makedirs(tmp_dir, exist_ok=True)
    except NotDvcRepoError:
        if not uninitialized:
            raise
        try:
            scm = SCM(root_dir or os.curdir)
        except SCMError:
            scm = SCM(os.curdir, no_scm=True)
        assert isinstance(scm, Base)
        root_dir = scm.root_dir
    return root_dir, dvc_dir, tmp_dir
def _get_repo_dirs(
    self,
    root_dir: str = None,
    scm: "Base" = None,
    rev: str = None,
    uninitialized: bool = False,
):
    """Resolve ``(root_dir, dvc_dir, tmp_dir)`` for this repository.

    When ``uninitialized`` is true and no DVC repo is found, fall back
    to the SCM root (or a no-SCM repo rooted at the cwd).
    """
    # scm and rev must be supplied together or not at all.
    assert bool(scm) == bool(rev)

    from dvc.fs.scm import GitFileSystem
    from dvc.scm import SCM, Base, Git, SCMError
    from dvc.utils.fs import makedirs

    dvc_dir = None
    tmp_dir = None
    try:
        if isinstance(scm, Git) and rev:
            fs = GitFileSystem(scm=scm, rev=rev)
        else:
            fs = None
        root_dir = self.find_root(root_dir, fs)
        dvc_dir = os.path.join(root_dir, self.DVC_DIR)
        tmp_dir = os.path.join(dvc_dir, "tmp")
        makedirs(tmp_dir, exist_ok=True)
    except NotDvcRepoError:
        if not uninitialized:
            raise
        try:
            scm = SCM(root_dir or os.curdir)
        except SCMError:
            scm = SCM(os.curdir, no_scm=True)
        assert isinstance(scm, Base)
        root_dir = scm.root_dir
    return root_dir, dvc_dir, tmp_dir
def test_makedirs(tmp_dir):
    """makedirs accepts both plain string paths and PathInfo objects."""
    str_path = os.path.join(tmp_dir, "directory")
    info_path = PathInfo(os.path.join(tmp_dir, "another", "directory"))

    makedirs(str_path)
    assert os.path.isdir(str_path)

    makedirs(info_path)
    assert os.path.isdir(info_path)
def put_file(self, from_file, to_info, callback=DEFAULT_CALLBACK, **kwargs):
    """Atomically copy ``from_file`` to ``to_info`` via a temporary file."""
    makedirs(to_info.parent, exist_ok=True)
    tmp_path = tmp_fname(to_info)
    copyfile(from_file, tmp_path, callback=callback)
    # replace() is atomic within the same filesystem.
    os.replace(tmp_path, to_info)
def _upload(
    self, from_file, to_info, name=None, no_progress_bar=False, **_kwargs
):
    """Copy ``from_file`` to ``to_info`` through a temp file, then rename."""
    makedirs(to_info.parent, exist_ok=True)
    tmp_path = tmp_fname(to_info)
    copyfile(
        from_file, tmp_path, name=name, no_progress_bar=no_progress_bar
    )
    # Rename into place so partially-copied files are never visible.
    os.rename(tmp_path, fspath_py35(to_info))
def test_get_to_dir(tmp_dir, erepo_dir, dname):
    """`dvc get` into an existing directory places the file inside it."""
    with erepo_dir.chdir():
        erepo_dir.dvc_gen("file", "contents", commit="create file")

    makedirs(dname, exist_ok=True)
    Repo.get(fspath(erepo_dir), "file", dname)

    dst_dir = tmp_dir / dname
    assert dst_dir.is_dir()
    assert (dst_dir / "file").read_text() == "contents"
def put_file(self, from_file, to_info, callback=DEFAULT_CALLBACK, **kwargs):
    """Atomically copy ``from_file`` to ``to_info`` via a temp file in the
    destination directory."""
    dest_dir = self.path.parent(to_info)
    makedirs(dest_dir, exist_ok=True)
    tmp_path = self.path.join(dest_dir, tmp_fname())
    copyfile(from_file, tmp_path, callback=callback)
    # replace() is atomic within the same filesystem.
    os.replace(tmp_path, to_info)
def download(self, hdfs_path, local_path, **kwargs):
    """Stream ``hdfs_path`` down to ``local_path`` in chunks."""
    from dvc.utils.fs import makedirs

    # Default to 64 KiB reads unless the caller overrides it.
    kwargs.setdefault("chunk_size", 2**16)
    makedirs(os.path.dirname(local_path), exist_ok=True)
    with open(local_path, "wb") as writer, self.read(
        hdfs_path, **kwargs
    ) as reader:
        for chunk in reader:
            writer.write(chunk)
def make_tpi(self, name: str):
    """Yield a TerraformBackend with a per-``name`` working directory.

    TPI failures are wrapped in DvcException.
    """
    from tpi import TPIError
    from tpi.terraform import TerraformBackend as TPIBackend

    try:
        backend_dir = os.path.join(self.tmp_dir, name)
        makedirs(backend_dir, exist_ok=True)
        yield TPIBackend(working_dir=backend_dir)
    except TPIError as exc:
        raise DvcException("TPI operation failed") from exc
def test_makedirs_permissions(tmp_dir):
    """`mode` must apply to intermediate directories too (non-ASCII names)."""
    dir_mode = 0o755
    intermediate_dir = "тестовая-директория"
    test_dir = os.path.join(intermediate_dir, "data")

    assert not os.path.exists(intermediate_dir)

    makedirs(test_dir, mode=dir_mode)

    for created in (test_dir, intermediate_dir):
        assert stat.S_IMODE(os.stat(created).st_mode) == dir_mode
def _download_file(self, from_info, to_info, name, no_progress_bar):
    """Download ``from_info`` into a temp file, then move it to ``to_info``."""
    makedirs(to_info.parent, exist_ok=True)

    logger.debug("Downloading '%s' to '%s'", from_info, to_info)
    if not name:
        name = to_info.name

    tmp_path = tmp_fname(to_info)
    self._download(  # noqa, pylint: disable=no-member
        from_info, tmp_path, name=name, no_progress_bar=no_progress_bar
    )
    move(tmp_path, to_info)
def test_import_to_dir(dname, tmp_dir, dvc, erepo_dir):
    """`dvc import` into an existing directory places the file inside it."""
    makedirs(dname, exist_ok=True)

    with erepo_dir.chdir():
        erepo_dir.dvc_gen("foo", "foo content", commit="create foo")

    stage = dvc.imp(os.fspath(erepo_dir), "foo", dname)

    dst = os.path.join(dname, "foo")
    assert stage.outs[0].fspath == os.path.abspath(dst)
    assert os.path.isdir(dname)
    assert (tmp_dir / dst).read_text() == "foo content"
def pack_repro_args(path, *args, tree=None, **kwargs):
    """Pickle reproduce ``args``/``kwargs`` into ``path``.

    Uses ``tree`` for directory creation and file access when given,
    otherwise the local filesystem.
    """
    parent = os.path.dirname(path)
    if tree:
        open_func = tree.open
        tree.makedirs(parent)
    else:
        from dvc.utils.fs import makedirs

        open_func = open
        makedirs(parent, exist_ok=True)

    payload = {"args": args, "kwargs": kwargs}
    with open_func(path, "wb") as fobj:
        pickle.dump(payload, fobj)
def test_import_url_to_dir(dname, tmp_dir, dvc):
    """`dvc import-url` into an existing directory places the file inside."""
    tmp_dir.gen({"data_dir": {"file": "file content"}})
    src = os.path.join("data_dir", "file")

    makedirs(dname, exist_ok=True)
    stage = dvc.imp_url(src, dname)

    dst = tmp_dir / dname / "file"
    assert stage.outs[0].fs_path == os.fspath(dst)
    assert os.path.isdir(dname)
    assert dst.read_text() == "file content"
def __init__(
    self,
    scm: "Git",
    wdir: str,
):
    """Set up the executor queue rooted at ``wdir``."""
    from dvc.utils.fs import makedirs

    self.scm = scm
    makedirs(wdir, exist_ok=True)
    self.wdir = wdir
    self.proc = ProcessManager(self.pid_dir)
    # Executors we currently own vs. ones recovered from saved infos.
    self._attached: Dict[str, "BaseExecutor"] = {}
    self._detached: Dict[str, "BaseExecutor"] = dict(self._load_infos())
    self._queue: Deque[Tuple[str, "BaseExecutor"]] = deque()
def __init__(
    self,
    tmp_dir: "StrPath",
    name: str,
):  # pylint: disable=super-init-not-called
    """Open (creating if needed) the on-disk index named ``name``."""
    from diskcache import Index

    from dvc.fs.local import LocalFileSystem
    from dvc.utils.fs import makedirs

    self.index_dir = os.path.join(tmp_dir, self.INDEX_DIR, name)
    makedirs(self.index_dir, exist_ok=True)
    self.fs = LocalFileSystem()
    self.index = Index(self.index_dir)
def __init__(self, root_dir=None):
    """Initialize a Repo rooted at (or discovered from) ``root_dir``."""
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.lock import make_lock
    from dvc.repo.metrics import Metrics
    from dvc.repo.tag import Tag
    from dvc.scm import SCM
    from dvc.scm.tree import WorkingTree
    from dvc.state import State
    from dvc.utils.fs import makedirs

    root_dir = self.find_root(root_dir)
    self.root_dir = os.path.abspath(os.path.realpath(root_dir))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

    self.config = Config(self.dvc_dir)
    self.scm = SCM(self.root_dir)
    self.tree = WorkingTree(self.root_dir)

    self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
    makedirs(self.tmp_dir, exist_ok=True)

    hardlink_lock = self.config.config["core"].get("hardlink_lock", False)
    self.lock = make_lock(
        os.path.join(self.dvc_dir, "lock"),
        tmp_dir=os.path.join(self.dvc_dir, "tmp"),
        hardlink_lock=hardlink_lock,
        friendly=True,
    )

    # NOTE: storing state and link_state in the repository itself to avoid
    # any possible state corruption in 'shared cache dir' scenario.
    self.state = State(self, self.config.config)

    core = self.config.config[Config.SECTION_CORE]
    level = core.get(Config.SECTION_CORE_LOGLEVEL)
    if level:
        logger.setLevel(level.upper())

    self.cache = Cache(self)
    self.cloud = DataCloud(self)

    self.metrics = Metrics(self)
    self.tag = Tag(self)

    self._ignore()
def dump(self):
    """Write this template's default content into the plot templates dir."""
    makedirs(self.plot_templates_dir, exist_ok=True)

    template_path = os.path.join(
        self.plot_templates_dir, self.TEMPLATE_NAME + self.EXTENSION
    )
    with open(template_path, "w") as fobj:
        json.dump(
            self.DEFAULT_CONTENT,
            fobj,
            indent=self.INDENT,
            separators=self.SEPARATORS,
        )
def test_show_running_workspace(tmp_dir, scm, dvc, exp_stage, capsys):
    """A workspace infofile must make `exp show` report the exp as running."""
    pid_dir = os.path.join(dvc.tmp_dir, EXEC_TMP_DIR, EXEC_PID_DIR)
    exec_info = make_executor_info(location=BaseExecutor.DEFAULT_LOCATION)
    pidfile = os.path.join(
        pid_dir,
        "workspace",
        f"workspace{BaseExecutor.INFOFILE_EXT}",
    )
    makedirs(os.path.dirname(pidfile), True)
    (tmp_dir / pidfile).dump_json(exec_info.asdict())

    expected_workspace = {
        "baseline": {
            "data": {
                "deps": {
                    "copy.py": {
                        "hash": ANY,
                        "size": ANY,
                        "nfiles": None,
                    }
                },
                "metrics": {"metrics.yaml": {"data": {"foo": 1}}},
                "params": {"params.yaml": {"data": {"foo": 1}}},
                "outs": {},
                "queued": False,
                "running": True,
                "executor": exec_info.location,
                "timestamp": None,
            }
        }
    }
    assert dvc.experiments.show()["workspace"] == expected_workspace

    capsys.readouterr()  # discard anything captured so far
    assert main(["exp", "show", "--csv"]) == 0
    cap = capsys.readouterr()
    assert "Running" in cap.out
    assert exec_info.location in cap.out
def __init__(self, root_dir=None):
    """Initialize a Repo rooted at (or discovered from) ``root_dir``."""
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.lock import make_lock
    from dvc.repo.metrics import Metrics
    from dvc.repo.params import Params
    from dvc.scm import SCM
    from dvc.scm.tree import WorkingTree
    from dvc.stage.cache import StageCache
    from dvc.state import State
    from dvc.utils.fs import makedirs

    root_dir = self.find_root(root_dir)
    self.root_dir = os.path.abspath(os.path.realpath(root_dir))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

    self.config = Config(self.dvc_dir)

    no_scm = self.config["core"].get("no_scm", False)
    self.scm = SCM(self.root_dir, no_scm=no_scm)

    self.tree = WorkingTree(self.root_dir)

    self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
    self.index_dir = os.path.join(self.tmp_dir, "index")
    makedirs(self.index_dir, exist_ok=True)

    hardlink_lock = self.config["core"].get("hardlink_lock", False)
    self.lock = make_lock(
        os.path.join(self.tmp_dir, "lock"),
        tmp_dir=self.tmp_dir,
        hardlink_lock=hardlink_lock,
        friendly=True,
    )

    # NOTE: storing state and link_state in the repository itself to avoid
    # any possible state corruption in 'shared cache dir' scenario.
    self.state = State(self)

    self.cache = Cache(self)
    self.cloud = DataCloud(self)
    self.stage_cache = StageCache(self.cache.local.cache_dir)

    self.metrics = Metrics(self)
    self.params = Params(self)

    self._ignore()