def __init__(
    self, dvc_dir=None, validate=True, tree=None, config=None,
):  # pylint: disable=super-init-not-called
    """Resolve the DVC directory, set up the trees and load the config.

    Args:
        dvc_dir: path to the DVC directory. When truthy it is stored as an
            absolute, symlink-resolved path. When falsy it is looked up
            with ``Repo.find_dvc_dir()``; if that raises
            ``NotDvcRepoError`` the directory is recorded as ``None``.
        validate: forwarded to ``self.load()``.
        tree: tree exposed as ``self.tree``; when falsy the local working
            tree (``self.wtree``) is used instead.
        config: forwarded to ``self.load()``.
    """
    from dvc.tree.local import LocalTree

    if dvc_dir:
        resolved_dir = os.path.abspath(os.path.realpath(dvc_dir))
    else:
        try:
            from dvc.repo import Repo

            resolved_dir = os.path.join(Repo.find_dvc_dir())
        except NotDvcRepoError:
            resolved_dir = None
    self.dvc_dir = resolved_dir

    # The working tree is always local; `tree` may override what is exposed
    # as `self.tree`.
    local_tree = LocalTree(None, {"url": self.dvc_dir})
    self.wtree = local_tree
    self.tree = tree or local_tree

    self.load(validate=validate, config=config)
def __init__(self, repo):
    """Bind the state object to ``repo`` and prepare its bookkeeping fields.

    Row limits are read from the ``"state"`` section of ``repo.config``,
    falling back to the class-level defaults. When ``repo.tmp_dir`` is
    falsy only ``self.state_file = None`` is set and the remaining
    storage-related attributes are left unassigned.

    Args:
        repo: object providing ``root_dir``, ``config`` and ``tmp_dir``.
    """
    from dvc.tree.local import LocalTree

    super().__init__()

    self.repo = repo
    self.root_dir = repo.root_dir
    self.tree = LocalTree(None, {"url": self.root_dir})

    settings = repo.config.get("state", {})
    self.row_limit = settings.get("row_limit", self.STATE_ROW_LIMIT)
    self.row_cleanup_quota = settings.get(
        "row_cleanup_quota", self.STATE_ROW_CLEANUP_QUOTA
    )

    if repo.tmp_dir:
        self.state_file = os.path.join(repo.tmp_dir, self.STATE_FILE)

        # https://www.sqlite.org/tempfiles.html
        self.temp_files = [
            self.state_file + suffix for suffix in ("-journal", "-wal")
        ]

        self.database = None
        self.cursor = None
        self.inserts = 0
    else:
        # No temporary directory: there is no state file to point at.
        self.state_file = None
def __init__(
    self,
    baseline_rev: str,
    checkpoint_reset: Optional[bool] = False,
    **kwargs,
):
    """Create an executor that works inside a fresh temporary DVC repo.

    Args:
        baseline_rev: revision forwarded to the parent initializer; its
            first 7 characters are included in the debug log.
        checkpoint_reset: stored as ``self.checkpoint_reset``.
        **kwargs: must contain ``dvc_dir`` and ``cache_dir`` (both are
            popped here); the remaining items are forwarded to the parent
            initializer.

    Raises:
        KeyError: if ``dvc_dir`` or ``cache_dir`` is missing from
            ``kwargs``.
    """
    from dvc.repo import Repo

    dvc_dir = kwargs.pop("dvc_dir")
    cache_dir = kwargs.pop("cache_dir")
    super().__init__(baseline_rev, **kwargs)
    self.tmp_dir = TemporaryDirectory()

    # init empty DVC repo (will be overwritten when input is uploaded)
    Repo.init(root_dir=self.tmp_dir.name, no_scm=True)
    logger.debug(
        "Init local executor in dir '%s' with baseline '%s'.",
        # Log the directory path; formatting the TemporaryDirectory object
        # itself would print its repr ("<TemporaryDirectory '...'>").
        self.tmp_dir.name,
        baseline_rev[:7],
    )
    self.dvc_dir = os.path.join(self.tmp_dir.name, dvc_dir)
    self._config(cache_dir)
    self._tree = LocalTree(self.dvc, {"url": self.dvc.root_dir})
    # override default CACHE_MODE since files must be writable in order
    # to run repro
    self._tree.CACHE_MODE = 0o644
    self.checkpoint_reset = checkpoint_reset
    self.checkpoint = False
def test_ignore_collecting_dvcignores(tmp_dir, dvc, dname):
    """Check which ignore objects are collected around a nested .dvcignore.

    One ignore file is written next to ``dname`` (ignoring ``dname`` by its
    basename) and another inside ``dname``. The tree must then expose three
    ignore objects, including the default directory ignores, a patterns
    trie and a repo ignore.
    """
    tmp_dir.gen({"dir": {"subdir": {}}})

    parent_ignore = (tmp_dir / dname).with_name(DvcIgnore.DVCIGNORE_FILE)
    parent_ignore.write_text(os.path.basename(dname))
    # Remove any `dvcignore` value stored on the tree instance (presumably a
    # cached property) so that the next access sees the new files.
    dvc.tree.__dict__.pop("dvcignore", None)

    nested_ignore = tmp_dir / dname / DvcIgnore.DVCIGNORE_FILE
    nested_ignore.write_text("foo")

    collected = dvc.tree.dvcignore.ignores
    assert len(collected) == 3
    assert DvcIgnoreDirs([".git", ".hg", ".dvc"]) in collected

    # Keep the last trie found, if any.
    tries = [
        entry for entry in collected
        if isinstance(entry, DvcIgnorePatternsTrie)
    ]
    ignore_pattern_trie = tries[-1] if tries else None
    assert ignore_pattern_trie is not None

    expected_patterns = DvcIgnorePatterns.from_files(
        os.fspath(parent_ignore), LocalTree(None, {"url": dvc.root_dir}),
    )
    assert expected_patterns == ignore_pattern_trie[os.fspath(nested_ignore)]

    assert any(
        entry for entry in collected if isinstance(entry, DvcIgnoreRepo)
    )
def test_is_protected(tmp_dir, dvc, link_name):
    """Follow the protection status of a file and a link made from it.

    ``link_name`` names the ``LocalTree`` method used to create the link.
    Protecting the source must be visible through the link; unprotecting
    the link must leave the source protected, except for hardlinks on
    Windows.
    """
    tree = LocalTree(dvc, {})
    make_link = getattr(tree, link_name)

    source_file = tmp_dir / "foo"
    source_file.write_text("foo")

    foo = PathInfo(source_file)
    link = PathInfo(tmp_dir / "link")
    make_link(foo, link)

    for path_info in (foo, link):
        assert not tree.is_protected(path_info)

    tree.protect(foo)
    for path_info in (foo, link):
        assert tree.is_protected(path_info)

    tree.unprotect(link)
    assert not tree.is_protected(link)

    if not (os.name == "nt" and link_name == "hardlink"):
        assert tree.is_protected(foo)
    else:
        # NOTE: NTFS doesn't allow deleting read-only files, which forces us to
        # set write perms on the link, which propagates to the source.
        assert not tree.is_protected(foo)
def test_track_from_multiple_files(tmp_dir):
    """Tracked keys are attributed to the params file that provides them.

    Two params files contribute different leaves under ``Train.us``. After
    merging the contexts, selecting intermediate dictionaries must not be
    recorded for either file, while each leaf must be recorded only under
    the file it came from -- also when reached through an alias.
    """
    lr_params = {"Train": {"us": {"lr": 10}}}
    layers_params = {"Train": {"us": {"layers": 100}}}

    tree = LocalTree(None, config={})
    path1 = tmp_dir / "params.yaml"
    path2 = tmp_dir / "params2.yaml"
    for target, payload in ((path1, lr_params), (path2, layers_params)):
        dump_yaml(target, payload, tree)

    context = Context.load_from(tree, path1)
    other = Context.load_from(tree, path2)
    context.merge_update(other)

    def is_tracked(record, source, key):
        return key in record[relpath(source)]

    with context.track() as tracked:
        context.select("Train")
        assert not is_tracked(tracked, path1, "Train")
        assert not is_tracked(tracked, path2, "Train")

        context.select("Train.us")
        assert not is_tracked(tracked, path1, "Train.us")
        assert not is_tracked(tracked, path2, "Train.us")

        context.select("Train.us.lr")
        assert is_tracked(tracked, path1, "Train.us.lr")
        assert not is_tracked(tracked, path2, "Train.us.lr")

        context.select("Train.us.layers")
        assert not is_tracked(tracked, path1, "Train.us.layers")
        assert is_tracked(tracked, path2, "Train.us.layers")

    # A clone starts without any tracked data.
    context = Context.clone(context)
    assert not context._tracked_data

    # let's see with an alias
    context["us"] = context["Train"]["us"]
    with context.track() as tracked:
        context.select("us")
        assert not is_tracked(tracked, path1, "Train.us")
        assert not is_tracked(tracked, path2, "Train.us")

        context.select("us.lr")
        assert is_tracked(tracked, path1, "Train.us.lr")
        assert not is_tracked(tracked, path2, "Train.us.lr")

        context.select("Train.us.layers")
        assert not is_tracked(tracked, path1, "Train.us.layers")
        assert is_tracked(tracked, path2, "Train.us.layers")
def test_protect_ignore_erofs(tmp_dir, mocker):
    """``protect()`` must not propagate an EROFS error raised by ``os.chmod``.

    ``os.chmod`` is patched to always raise ``OSError(EROFS)``; the call
    has to complete and the patched function has to have been invoked.
    """
    tmp_dir.gen("foo", "foo")
    target = PathInfo("foo")
    tree = LocalTree(None, {})

    read_only_error = OSError(errno.EROFS, "read-only fs")
    chmod_mock = mocker.patch("os.chmod", side_effect=read_only_error)

    tree.protect(target)

    assert chmod_mock.called
def test_protect_ignore_errors(tmp_dir, mocker, err):
    """``protect()`` on an already protected file tolerates ``os.chmod`` errors.

    The file is protected once with the real ``os.chmod``. Then
    ``os.chmod`` is patched to raise ``OSError(err, ...)`` and ``protect()``
    is called again: it must return normally after invoking the patched
    function.
    """
    tmp_dir.gen("foo", "foo")
    target = PathInfo("foo")
    tree = LocalTree(None, {})

    # First pass goes through the unpatched os.chmod.
    tree.protect(target)

    failure = OSError(err, "something")
    chmod_mock = mocker.patch("os.chmod", side_effect=failure)
    tree.protect(target)

    assert chmod_mock.called
def test_nobranch(self):
    """Walk the working tree (dvcignore enabled) using relative paths.

    Covers a walk from ``"."`` and a walk that starts in a nested
    directory.
    """
    tree = LocalTree(None, {"url": self._root_dir}, use_dvcignore=True)
    nested = join("data_dir", "data_sub_dir")

    expected_from_root = [
        (".", ["data_dir"], ["bar", "тест", "code.py", "foo"]),
        (join("data_dir"), ["data_sub_dir"], ["data"]),
        (nested, [], ["data_sub"]),
    ]
    self.assertWalkEqual(tree.walk("."), expected_from_root)

    expected_from_nested = [(nested, [], ["data_sub"])]
    self.assertWalkEqual(tree.walk(nested), expected_from_nested)
def test_path_object_and_str_are_valid_types_get_mtime_and_size(tmp_dir):
    """``get_mtime_and_size`` gives equal results for ``str`` and ``PathInfo``.

    Checked for both a directory and a regular file.
    """
    tmp_dir.gen(
        {"dir": {"dir_file": "dir file content"}, "file": "file_content"}
    )
    tree = LocalTree(None, {"url": os.fspath(tmp_dir)}, use_dvcignore=True)

    for name in ("dir", "file"):
        str_time, str_size = get_mtime_and_size(name, tree)
        obj_time, obj_size = get_mtime_and_size(PathInfo(name), tree)
        assert str_time == obj_time
        assert str_size == obj_size
def test(self):
    """``get_mtime_and_size`` returns two strings; the size matches the disk.

    For a directory the expected size is the sum of the sizes of
    ``self.DATA`` and ``self.DATA_SUB``.
    """
    tree = LocalTree(None, {"url": self.root_dir}, use_dvcignore=True)

    file_result = get_mtime_and_size(self.DATA, tree)
    dir_result = get_mtime_and_size(self.DATA_DIR, tree)

    expected_file_size = os.path.getsize(self.DATA)
    expected_dir_size = sum(
        os.path.getsize(path) for path in (self.DATA, self.DATA_SUB)
    )

    checks = (
        (file_result, expected_file_size),
        (dir_result, expected_dir_size),
    )
    for (mtime, size), expected_size in checks:
        self.assertIs(type(mtime), str)
        self.assertIs(type(size), str)
        self.assertEqual(size, str(expected_size))
def test_track(tmp_dir):
    """Selecting keys inside ``context.track()`` records only some of them.

    With a single params file, the list ``lst`` and scalar leaves are
    recorded, whereas dictionaries (``dct`` and ``lst.0``) are not.
    """
    params = {
        "lst": [
            {"foo0": "foo0", "bar0": "bar0"},
            {"foo1": "foo1", "bar1": "bar1"},
        ],
        "dct": {"foo": "foo", "bar": "bar", "baz": "baz"},
    }
    tree = LocalTree(None, config={})
    path = tmp_dir / "params.yaml"
    dump_yaml(path, params, tree)

    context = Context.load_from(tree, path)

    def key_tracked(record, key):
        # Only one source file is involved in this test.
        assert len(record) == 1
        return key in record[relpath(path)]

    expectations = (
        ("lst", True),
        ("dct", False),
        ("dct.foo", True),
        # Currently, it's unable to track dictionaries, as it can be merged
        # from multiple sources.
        ("lst.0", False),
        # FIXME: either support tracking list values in ParamsDependency
        # or, prevent this from being tracked.
        ("lst.0.foo0", True),
    )

    with context.track() as tracked:
        for key, should_be_tracked in expectations:
            context.select(key)
            assert key_tracked(tracked, key) is should_be_tracked
def test(self):
    """Walk the working tree from its root using absolute paths."""
    root = self._root_dir
    tree = LocalTree(None, {"url": root})

    data_dir = join(root, "data_dir")
    data_sub_dir = join(root, "data_dir", "data_sub_dir")
    expected = [
        (root, ["data_dir"], ["code.py", "bar", "тест", "foo"]),
        (data_dir, ["data_sub_dir"], ["data"]),
        (data_sub_dir, [], ["data_sub"]),
    ]

    self.assertWalkEqual(tree.walk(root), expected)
def test_status_download_optimization(mocker, dvc):
    """When comparing the status to pull a remote cache,
    And the desired files to fetch are already on the local cache,
    Don't check the existence of the desired files on the remote cache
    """
    cache = LocalCache(LocalTree(dvc, {}))

    infos = NamedCache()
    entries = (
        ("acbd18db4cc2f85cedef654fccc4a4d8", "foo"),
        ("37b51d194a7513e45b56f6524f2d51f2", "bar"),
    )
    for checksum, name in entries:
        infos.add("local", checksum, name)

    # Pretend every requested hash is already present in the local cache.
    already_local = list(infos["local"])
    mocker.patch.object(cache, "hashes_exist", return_value=already_local)

    remote = mocker.Mock()
    remote.url = "other_remote"
    remote.hashes_exist.return_value = []
    remote.index = RemoteIndexNoop()

    cache.status(infos, remote, download=True)

    assert remote.hashes_exist.call_count == 0
def __init__(
    self,
    root_dir=None,
    scm=None,
    rev=None,
    subrepos=False,
    uninitialized=False,
):
    """Locate the repo directories and wire up the repo's components.

    Args:
        root_dir: starting point handed to ``self.find_root()``.
        scm: optional SCM object. When truthy, the tree is obtained from
            it and no-op lock/state objects are used.
        rev: revision passed to ``scm.get_tree()``.
        subrepos: stored as ``self.subrepos``.
        uninitialized: when true, a ``NotDvcRepoError`` during directory
            lookup is tolerated: the root is taken from ``SCM(...)`` and
            ``dvc_dir``/``tmp_dir`` are set to ``None``.

    Raises:
        NotDvcRepoError: if the lookup fails and ``uninitialized`` is
            false.
    """
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.lock import LockNoop, make_lock
    from dvc.repo.experiments import Experiments
    from dvc.repo.metrics import Metrics
    from dvc.repo.params import Params
    from dvc.repo.plots import Plots
    from dvc.scm import SCM
    from dvc.stage.cache import StageCache
    from dvc.state import State, StateNoop
    from dvc.tree.local import LocalTree
    from dvc.utils.fs import makedirs

    try:
        if rev:
            tree = scm.get_tree(rev)
        else:
            tree = None
        self.root_dir = self.find_root(root_dir, tree)
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
        self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
        makedirs(self.tmp_dir, exist_ok=True)
    except NotDvcRepoError:
        if uninitialized:
            self.root_dir = SCM(root_dir or os.curdir).root_dir
            self.dvc_dir = None
            self.tmp_dir = None
        else:
            raise

    tree_kwargs = {"use_dvcignore": True, "dvcignore_root": self.root_dir}
    if not scm:
        self.tree = LocalTree(self, {"url": self.root_dir}, **tree_kwargs)
    else:
        self.tree = scm.get_tree(rev, **tree_kwargs)

    self.config = Config(self.dvc_dir, tree=self.tree)
    self._scm = scm

    # used by RepoTree to determine if it should traverse subrepos
    self.subrepos = subrepos

    self.cache = Cache(self)
    self.cloud = DataCloud(self)

    # NOTE(review): the nesting below was reconstructed from flattened
    # source -- confirm that stage_cache, experiments and _ignore() belong
    # to the "initialized, no scm" branch only.
    if not scm and self.dvc_dir:
        lock_path = os.path.join(self.tmp_dir, "lock")
        self.lock = make_lock(
            lock_path,
            tmp_dir=self.tmp_dir,
            hardlink_lock=self.config["core"].get("hardlink_lock", False),
            friendly=True,
        )

        # NOTE: storing state and link_state in the repository itself to
        # avoid any possible state corruption in 'shared cache dir'
        # scenario.
        self.state = State(self)
        self.stage_cache = StageCache(self)

        try:
            self.experiments = Experiments(self)
        except NotImplementedError:
            self.experiments = None

        self._ignore()
    else:
        self.lock = LockNoop()
        self.state = StateNoop()

    self.metrics = Metrics(self)
    self.plots = Plots(self)
    self.params = Params(self)
def setUp(self):
    """Run the parent fixture setup, then attach a ``LocalTree`` to the test.

    The tree is built with no repo (``None``) and an empty config.
    """
    super().setUp()
    empty_config = {}
    self.tree = LocalTree(None, empty_config)
def tree(self):
    """Return the tree to read from.

    Without an SCM (``self.scm`` falsy) a ``LocalTree`` rooted at
    ``self.root_dir`` is returned; otherwise the tree comes from
    ``self.scm.get_tree(self.rev)``.
    """
    if not self.scm:
        return LocalTree(self, {"url": self.root_dir})
    return self.scm.get_tree(self.rev)
def test_subdir(self):
    """Walk a nested directory of the working tree via a relative path."""
    tree = LocalTree(None, {"url": self._root_dir})
    nested = join("data_dir", "data_sub_dir")

    expected = [(nested, [], ["data_sub"])]
    self.assertWalkEqual(tree.walk(nested), expected)
def __init__(
    self,
    root_dir=None,
    scm=None,
    rev=None,
    subrepos=False,
    uninitialized=False,
):
    """Resolve the repo directories and wire up the repo's components.

    Args:
        root_dir: forwarded to ``self._get_repo_dirs()``.
        scm: optional SCM object. When truthy, the tree is obtained from
            it and no-op lock/state objects are used.
        rev: revision passed to ``scm.get_tree()`` and
            ``self._get_repo_dirs()``.
        subrepos: stored as ``self.subrepos``.
        uninitialized: forwarded to ``self._get_repo_dirs()``.
    """
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.lock import LockNoop, make_lock
    from dvc.repo.metrics import Metrics
    from dvc.repo.params import Params
    from dvc.repo.plots import Plots
    from dvc.repo.stage import StageLoad
    from dvc.stage.cache import StageCache
    from dvc.state import State, StateNoop
    from dvc.tree.local import LocalTree

    repo_dirs = self._get_repo_dirs(
        root_dir=root_dir, scm=scm, rev=rev, uninitialized=uninitialized
    )
    self.root_dir, self.dvc_dir, self.tmp_dir = repo_dirs

    tree_kwargs = dict(use_dvcignore=True, dvcignore_root=self.root_dir)
    if not scm:
        self.tree = LocalTree(self, {"url": self.root_dir}, **tree_kwargs)
    else:
        self.tree = scm.get_tree(rev, **tree_kwargs)

    self.config = Config(self.dvc_dir, tree=self.tree)
    self._scm = scm

    # used by RepoTree to determine if it should traverse subrepos
    self.subrepos = subrepos

    self.cache = Cache(self)
    self.cloud = DataCloud(self)
    self.stage = StageLoad(self)

    # NOTE(review): the nesting below was reconstructed from flattened
    # source -- confirm that stage_cache and _ignore() belong to the
    # "initialized, no scm" branch only.
    if not scm and self.dvc_dir:
        lock_path = os.path.join(self.tmp_dir, "lock")
        self.lock = make_lock(
            lock_path,
            tmp_dir=self.tmp_dir,
            hardlink_lock=self.config["core"].get("hardlink_lock", False),
            friendly=True,
        )

        # NOTE: storing state and link_state in the repository itself to
        # avoid any possible state corruption in 'shared cache dir'
        # scenario.
        self.state = State(self)
        self.stage_cache = StageCache(self)

        self._ignore()
    else:
        self.lock = LockNoop()
        self.state = StateNoop()

    self.metrics = Metrics(self)
    self.plots = Plots(self)
    self.params = Params(self)
    self.stage_collection_error_handler = None
    self._lock_depth = 0
def __init__(self, root_dir=None, scm=None, rev=None):
    """Locate the repo root and wire up the repo's components.

    Args:
        root_dir: starting point handed to ``self.find_root()``.
        scm: optional SCM object. When truthy it is stored as
            ``self.scm``, the tree is read from it and a no-op state is
            used; otherwise a ``LocalTree`` is used, an ``SCM`` is created
            from the config and a real ``State`` is set up.
        rev: revision passed to ``scm.get_tree()``.
    """
    from dvc.state import State, StateNoop
    from dvc.lock import make_lock
    from dvc.scm import SCM
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.repo.experiments import Experiments
    from dvc.repo.metrics import Metrics
    from dvc.repo.plots import Plots
    from dvc.repo.params import Params
    from dvc.tree.local import LocalTree
    from dvc.utils.fs import makedirs
    from dvc.stage.cache import StageCache

    if not scm:
        found_root = self.find_root(root_dir)
        self.root_dir = os.path.abspath(os.path.realpath(found_root))
        self.tree = LocalTree(
            self,
            {"url": self.root_dir},
            use_dvcignore=True,
            dvcignore_root=self.root_dir,
        )
    else:
        # A plain tree is enough to locate the root; the tree kept on the
        # instance is created afterwards with dvcignore enabled.
        lookup_tree = scm.get_tree(rev)
        self.root_dir = self.find_root(root_dir, lookup_tree)
        self.scm = scm
        self.tree = scm.get_tree(
            rev, use_dvcignore=True, dvcignore_root=self.root_dir
        )
        self.state = StateNoop()

    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
    self.config = Config(self.dvc_dir, tree=self.tree)

    core_config = self.config["core"]
    if not scm:
        self.scm = SCM(self.root_dir, no_scm=core_config.get("no_scm", False))

    self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
    self.index_dir = os.path.join(self.tmp_dir, "index")
    makedirs(self.index_dir, exist_ok=True)

    hardlink_lock = self.config["core"].get("hardlink_lock", False)
    lock_path = os.path.join(self.tmp_dir, "lock")
    self.lock = make_lock(
        lock_path,
        tmp_dir=self.tmp_dir,
        hardlink_lock=hardlink_lock,
        friendly=True,
    )

    self.cache = Cache(self)
    self.cloud = DataCloud(self)

    # NOTE(review): the nesting below was reconstructed from flattened
    # source -- confirm that only the State assignment is conditional.
    if not scm:
        # NOTE: storing state and link_state in the repository itself to
        # avoid any possible state corruption in 'shared cache dir'
        # scenario.
        self.state = State(self.cache.local)

    self.stage_cache = StageCache(self)

    self.metrics = Metrics(self)
    self.plots = Plots(self)
    self.params = Params(self)

    try:
        self.experiments = Experiments(self)
    except NotImplementedError:
        self.experiments = None

    self._ignore()