def test_load_stage(dvc, stage_data, lock_data):
    """Load ``stage-1`` via ``StageLoader`` and verify its basic attributes."""
    loaded = StageLoader.load_stage(
        Dvcfile(dvc, PIPELINE_FILE), "stage-1", stage_data, lock_data
    )

    # Identity and location of the loaded stage.
    assert loaded.wdir == os.path.abspath(os.curdir)
    assert loaded.name == "stage-1"
    assert loaded.cmd == "command"
    assert loaded.path == os.path.abspath(PIPELINE_FILE)

    # First dependency: declared path and hash.
    first_dep = loaded.deps[0]
    assert first_dep.def_path == "foo"
    assert first_dep.hash_info == HashInfo("md5", "foo_checksum")

    # First output: declared path and hash.
    first_out = loaded.outs[0]
    assert first_out.def_path == "bar"
    assert first_out.hash_info == HashInfo("md5", "bar_checksum")
def _create_stages(
    repo,
    targets,
    fname,
    pbar=None,
    external=False,
    glob=False,
    desc=None,
    transfer=False,
    **kwargs,
):
    """Create one single-output stage per target and return them as a list.

    Targets are first passed through ``glob_targets``. For every resulting
    target a ``Stage`` is created, its meta is restored, the existing DVC file
    at the stage path is removed, and ``desc`` (when given) is assigned to the
    first output. ``pbar``, when provided, is updated with the output of every
    stage that is kept.
    """
    from dvc.dvcfile import Dvcfile
    from dvc.stage import Stage, create_stage, restore_meta

    all_targets = glob_targets(targets, glob=glob)
    progress = Tqdm(
        all_targets,
        desc="Creating DVC files",
        disable=len(all_targets) < LARGE_DIR_SIZE,
        unit="file",
    )
    # An explicit ``out`` keyword redirects where each target ends up.
    out_override = kwargs.get("out")

    created = []
    for target in progress:
        if out_override:
            target = resolve_output(target, out_override)
        path, wdir, out = resolve_paths(
            repo, target, always_local=transfer and not out_override
        )

        stage = create_stage(
            Stage,
            repo,
            fname or path,
            wdir=wdir,
            outs=[out],
            external=external,
        )
        restore_meta(stage)
        Dvcfile(repo, stage.path).remove()
        if desc:
            stage.outs[0].desc = desc

        repo._reset()  # pylint: disable=protected-access

        # NOTE(review): ``stage`` is already dereferenced above, so the falsy
        # branch below looks unreachable unless a stage object can itself be
        # falsy -- confirm before relying on it.
        if stage:
            created.append(stage)
            if pbar is not None:
                pbar.update_msg(out)
        elif pbar is not None:
            pbar.total -= 1

    return created
def test_load_all_singlestage(tmp_dir, dvc):
    """A single-stage ``.dvc`` file exposes exactly the stage that wrote it."""
    tmp_dir.gen("foo", "foo")
    created = dvc.run(
        cmd="cp foo foo2",
        deps=["foo"],
        metrics=["foo2"],
        always_changed=True,
        single_stage=True,
    )

    loaded = Dvcfile(dvc, "foo2.dvc").stages.values()

    assert len(loaded) == 1
    assert [*loaded] == [created]
def dvcfile(self):
    """Return the ``Dvcfile`` for this stage's path, reusing a cached one.

    The cached object is returned only while its path still equals
    ``self.path``; otherwise a new ``Dvcfile`` is built and cached.

    Raises:
        DvcException: if the stage has no path set.
    """
    if not self.path:
        raise DvcException(
            "Stage does not have any path set and is detached from dvcfile."
        )

    cached = self._dvcfile
    if cached and self.path == cached.path:
        return cached

    from dvc.dvcfile import Dvcfile

    self._dvcfile = Dvcfile(self.repo, self.path)
    return self._dvcfile
def test_load_stage_with_params(dvc, stage_data, lock_data):
    """Params declared in the stage data are loaded alongside regular deps."""
    lock_data["params"] = {"params.yaml": {"lorem": "ipsum"}}
    stage_data["params"] = ["lorem"]

    loaded = StageLoader.load_stage(
        Dvcfile(dvc, PIPELINE_FILE), "stage-1", stage_data, lock_data
    )
    params, deps = split_params_deps(loaded)

    # Declared paths of the regular dependency and of the output.
    assert deps[0].def_path == "foo"
    assert loaded.outs[0].def_path == "bar"

    # The params dependency points at params.yaml and carries the locked values.
    assert params[0].def_path == "params.yaml"
    assert params[0].hash_info == HashInfo("params", {"lorem": "ipsum"})

    # Hashes of the regular dependency and of the output.
    assert deps[0].hash_info == HashInfo("md5", "foo_checksum")
    assert loaded.outs[0].hash_info == HashInfo("md5", "bar_checksum")
def test_has_stage_with_name(tmp_dir, dvc):
    """Stage membership is matched on the full stage name, not a prefix."""
    stage_name = "copy-foo-foo2"

    tmp_dir.gen("foo", "foo")
    dvc.run(
        cmd="cp foo foo2",
        deps=["foo"],
        name=stage_name,
        metrics=["foo2"],
        always_changed=True,
    )

    pipeline_file = Dvcfile(dvc, PIPELINE_FILE)

    assert stage_name in pipeline_file.stages
    assert "copy" not in pipeline_file.stages
def _create_stages(
    repo, targets, fname, pbar=None, external=False, glob=False, desc=None,
):
    """Create one single-output stage per target and return the kept ones.

    With ``glob`` set, every target is treated as a recursive glob pattern
    and expanded first. A stage whose ``can_be_skipped`` is true is dropped
    (and ``pbar.total`` is decremented); for every other stage the existing
    DVC file at its path is removed and ``desc`` (when given) is assigned to
    its first output.
    """
    from glob import iglob

    from dvc.stage import Stage, create_stage, restore_meta

    if glob:
        expanded_targets = []
        for pattern in targets:
            expanded_targets.extend(iglob(pattern, recursive=True))
    else:
        expanded_targets = targets

    progress = Tqdm(
        expanded_targets,
        desc="Creating DVC-files",
        disable=len(expanded_targets) < LARGE_DIR_SIZE,
        unit="file",
    )

    kept = []
    for target in progress:
        path, wdir, out = resolve_paths(repo, target)
        stage = create_stage(
            Stage,
            repo,
            fname or path,
            wdir=wdir,
            outs=[out],
            external=external,
        )
        restore_meta(stage)

        if not stage.can_be_skipped:
            Dvcfile(repo, stage.path).remove()
            if desc:
                stage.outs[0].desc = desc
        else:
            stage = None

        repo._reset()  # pylint: disable=protected-access

        if stage:
            kept.append(stage)
            if pbar is not None:
                pbar.update_msg(out)
        elif pbar is not None:
            # A skipped stage no longer counts towards the caller's total.
            pbar.total -= 1

    return kept
def run(self, fname=None, no_exec=False, single_stage=False, **kwargs):
    """Create a stage from ``kwargs``, optionally execute it, and dump it.

    Args:
        fname: file path for a single-stage file; when empty the path comes
            from ``_get_file_path(kwargs)``. Only used with ``single_stage``.
        no_exec: when true the stage is not run; ``stage.ignore_outs()`` is
            called instead and the dump is done with ``no_lock=True``.
        single_stage: create a ``Stage`` in its own file instead of a named
            ``PipelineStage`` in ``PIPELINE_FILE``.
        **kwargs: forwarded to ``create_stage``. ``cmd`` is mandatory;
            ``name``, ``overwrite``, ``no_commit`` and ``run_cache`` are also
            read here.

    Returns:
        The created stage, or ``None`` when ``create_stage`` returns ``None``.

    Raises:
        InvalidArgumentError: ``cmd`` is missing, ``name`` is combined with
            ``single_stage``, or ``name`` is missing for a pipeline stage.
        InvalidStageName: the pipeline stage name fails ``is_valid_name``.
        DuplicateStageName: the target file already has a stage of that name.
        OutputDuplicationError: re-raised from ``check_modified_graph`` with
            the new stage excluded from the conflicting stages.
    """
    from dvc.stage import PipelineStage, Stage, create_stage
    from dvc.dvcfile import Dvcfile, PIPELINE_FILE

    if not kwargs.get("cmd"):
        raise InvalidArgumentError("command is not specified")

    stage_cls = PipelineStage
    path = PIPELINE_FILE
    stage_name = kwargs.get("name")

    if stage_name and single_stage:
        raise InvalidArgumentError(
            "`-n|--name` is incompatible with `--single-stage`"
        )
    if not stage_name and not single_stage:
        raise InvalidArgumentError("`-n|--name` is required")

    if single_stage:
        # A (falsy) ``name`` must not leak into the single-stage constructor.
        kwargs.pop("name", None)
        stage_cls = Stage
        path = fname or _get_file_path(kwargs)
    elif not is_valid_name(stage_name):
        raise InvalidStageName

    stage = create_stage(stage_cls, repo=self, path=path, **kwargs)
    if stage is None:
        return None

    dvcfile = Dvcfile(self, stage.path)
    if dvcfile.exists():
        if stage_name and stage_name in dvcfile.stages:
            raise DuplicateStageName(stage_name, dvcfile)
        # Classes are compared by identity; only single-stage files are
        # replaced as a whole.
        if stage_cls is not PipelineStage:
            dvcfile.remove_with_prompt(force=kwargs.get("overwrite", True))

    try:
        self.check_modified_graph([stage])
    except OutputDuplicationError as exc:
        # Drop the new stage from the conflict set and chain explicitly so
        # the traceback marks the original error as the direct cause.
        raise OutputDuplicationError(
            exc.output, set(exc.stages) - {stage}
        ) from exc

    if no_exec:
        stage.ignore_outs()
    else:
        stage.run(
            no_commit=kwargs.get("no_commit", False),
            run_cache=kwargs.get("run_cache", True),
        )

    dvcfile.dump(stage, update_pipeline=True, no_lock=no_exec)
    return stage
def test_try_loading_dvcfile_that_is_gitignored(tmp_dir, dvc, scm, file):
    """Loading a DVC file listed in ``.gitignore`` raises ``FileIsGitIgnored``."""
    with open(tmp_dir / ".gitignore", "a+", encoding="utf-8") as ignore_file:
        ignore_file.write(file)

    # The file only has to exist so that other checks do not trigger first.
    (tmp_dir / file).write_text("")
    scm._reset()

    ignored_dvcfile = Dvcfile(dvc, file)
    expected_message = f"bad DVC file name '{file}' is git-ignored."

    with pytest.raises(FileIsGitIgnored) as exc_info:
        ignored_dvcfile._load()

    assert str(exc_info.value) == expected_message
def load_one(self, path: str = None, name: str = None) -> "Stage":
    """Return the stage called ``name`` from the DVC file at ``path``.

    Args:
        path: file to load from; if not provided, default `dvc.yaml` is
            assumed.
        name: required for `dvc.yaml` files, ignored for `.dvc` files.
    """
    resolved_path = self._get_filepath(path, name)
    return Dvcfile(self.repo, resolved_path).stages[name]  # type: ignore
def test_try_get_single_stage_from_pipeline_file(tmp_dir, dvc):
    """Accessing ``.stage`` on a pipeline file raises ``DvcException``."""
    from dvc.dvcfile import DvcException

    tmp_dir.gen("foo", "foo")
    dvc.run(
        cmd="cp foo foo2",
        deps=["foo"],
        name="copy-foo-foo2",
        metrics=["foo2"],
        always_changed=True,
    )

    pipeline_file = Dvcfile(dvc, PIPELINE_FILE)
    with pytest.raises(DvcException):
        assert pipeline_file.stage
def add(self, tag, target=None, with_deps=False, recursive=False):
    """Record ``tag`` on every collected output that has checksum info.

    Outputs without info are skipped with a warning. A stage is dumped back
    to its DVC file only if at least one of its outputs was tagged.
    """
    for stage in self.collect(target, with_deps=with_deps, recursive=recursive):
        tagged_any = False

        for out in stage.outs:
            if out.info:
                out.tags[tag] = copy(out.info)
                tagged_any = True
            else:
                logger.warning("missing checksum info for '{}'".format(out))

        if tagged_any:
            Dvcfile(self, stage.path).dump(stage)
def remove(self, tag, target=None, with_deps=False, recursive=False):
    """Delete ``tag`` from every collected output that carries it.

    Outputs without the tag are skipped with a warning. A stage is dumped
    back to its DVC file only if at least one of its outputs lost the tag.
    """
    for stage in self.collect(target, with_deps=with_deps, recursive=recursive):
        removed_any = False

        for out in stage.outs:
            if tag in out.tags.keys():
                del out.tags[tag]
                removed_any = True
            else:
                logger.warning("tag '{}' not found for '{}'".format(tag, out))

        if removed_any:
            Dvcfile(self, stage.path).dump(stage)
def test_load_all_multistage(tmp_dir, dvc):
    """The pipeline file lists every named stage that has been run so far."""
    tmp_dir.gen("foo", "foo")
    first = dvc.run(
        cmd="cp foo foo2",
        deps=["foo"],
        name="copy-foo-foo2",
        metrics=["foo2"],
        always_changed=True,
    )

    after_first = Dvcfile(dvc, PIPELINE_FILE).stages.values()
    assert len(after_first) == 1
    assert [*after_first] == [first]

    tmp_dir.gen("bar", "bar")
    second = dvc.run(
        cmd="cp bar bar2",
        deps=["bar"],
        name="copy-bar-bar2",
        metrics=["bar2"],
        always_changed=True,
    )

    after_second = set(Dvcfile(dvc, PIPELINE_FILE).stages.values())
    assert after_second == {second, first}
def test_force_with_dependencies(self):
    """``repro --force`` yields an output checksum different from the first run."""
    initial_stage = self.dvc.run(
        fname="datetime.dvc",
        deps=[self.FOO],
        outs=["datetime.txt"],
        cmd='python -c "import time; print(time.time())" > datetime.txt',
    )
    run_out = initial_stage.outs[0]

    exit_code = main(["repro", "--force", "datetime.dvc"])
    self.assertEqual(exit_code, 0)

    reloaded_stage = Dvcfile(self.dvc, "datetime.dvc").load()
    repro_out = reloaded_stage.outs[0]

    self.assertNotEqual(run_out.checksum, repro_out.checksum)
def test_download_error_pulling_imported_stage(tmp_dir, dvc, erepo_dir):
    """Pulling an imported stage raises ``DownloadError`` when download fails."""
    with erepo_dir.chdir():
        erepo_dir.dvc_gen("foo", "foo content", commit="create foo")
    dvc.imp(os.fspath(erepo_dir), "foo", "foo_imported")

    imported_stage = Dvcfile(dvc, "foo_imported.dvc").stage
    cache_path = imported_stage.outs[0].cache_path

    # Delete both the workspace copy and the cached copy before pulling.
    remove("foo_imported")
    remove(cache_path)

    failing_download = patch(
        "dvc.fs.local.LocalFileSystem._download", side_effect=Exception
    )
    with failing_download:
        with pytest.raises(DownloadError):
            dvc.pull(["foo_imported.dvc"])
def test_multistage_with_wdir(tmp_dir, dvc):
    """The ``wdir`` passed to ``run`` is written into the stage's file entry."""
    from dvc.dvcfile import Dvcfile

    stage_name = "copy-foo1-foo2"

    tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}})
    stage = dvc.run(
        cmd="cp foo foo1",
        deps=["foo"],
        name=stage_name,
        outs=["foo1"],
        wdir="dir",
    )

    data, _ = Dvcfile(dvc, stage.path)._load()
    stage_entry = data["stages"][stage_name]

    assert stage_entry["wdir"] == "dir"
def test_pull_imported_stage(tmp_dir, dvc, erepo_dir):
    """Pulling an imported stage restores the workspace file and cache entry."""
    with erepo_dir.chdir():
        erepo_dir.dvc_gen("foo", "foo content", commit="create foo")
    dvc.imp(os.fspath(erepo_dir), "foo", "foo_imported")

    imported_stage = Dvcfile(dvc, "foo_imported.dvc").stage
    cache_path = imported_stage.outs[0].cache_path

    # Delete both copies so that the pull has to bring them back.
    remove("foo_imported")
    remove(cache_path)

    dvc.pull(["foo_imported.dvc"])

    assert os.path.isfile("foo_imported")
    assert os.path.isfile(cache_path)
def test_remove_stage_removes_dvcfiles_if_no_stages_left(
    tmp_dir, dvc, run_copy
):
    """Removing the only stage removes the pipeline file and its lockfile."""
    tmp_dir.gen("foo", "foo")
    run_copy("foo", "bar", name="run_copy")

    pipeline_file = Dvcfile(dvc, PIPELINE_FILE)

    # State before the removal.
    assert pipeline_file.exists()
    assert (tmp_dir / PIPELINE_LOCK).exists()
    assert (tmp_dir / "foo").exists()

    only_stage = pipeline_file.stages["run_copy"]
    pipeline_file.remove_stage(only_stage)

    # State after the removal.
    assert not pipeline_file.exists()
    assert not (tmp_dir / PIPELINE_LOCK).exists()
def test_multistage_always_changed(tmp_dir, dvc):
    """``always_changed=True`` is written into the stage's file entry."""
    from dvc.dvcfile import Dvcfile

    stage_name = "copy-foo1-foo2"

    tmp_dir.gen({"foo": "foo", "bar": "bar"})
    stage = dvc.run(
        cmd="cp foo foo1",
        deps=["foo"],
        name=stage_name,
        outs=["foo1"],
        always_changed=True,
    )

    data, _ = Dvcfile(dvc, stage.path)._load()
    stage_entry = data["stages"][stage_name]

    assert stage_entry["always_changed"]
def imp_url(
    self, url, out=None, fname=None, erepo=None, frozen=True, no_exec=False
):
    """Create, optionally run, and dump a stage with ``url`` as its dependency.

    Args:
        url: source to import; rewritten relative to the stage ``wdir`` when
            ``erepo`` is ``None`` and ``url`` is an existing path inside
            ``self.root_dir``.
        out: output location, resolved through ``resolve_output``.
        fname: path of the stage file; defaults to the path returned by
            ``resolve_paths``.
        erepo: forwarded to ``create_stage``.
        frozen: value assigned to ``stage.frozen`` before dumping.
        no_exec: when true the stage is not run and ``stage.ignore_outs()``
            is called instead.

    Returns:
        The created stage, or ``None`` when ``create_stage`` returns ``None``.

    Raises:
        OutputDuplicationError: re-raised from ``check_modified_graph`` with
            the new stage excluded from the conflicting stages.
    """
    from dvc.dvcfile import Dvcfile
    from dvc.stage import Stage, create_stage

    out = resolve_output(url, out)
    path, wdir, out = resolve_paths(self, out)

    # NOTE: when user is importing something from within their own repository
    if (
        erepo is None
        and os.path.exists(url)
        and path_isin(os.path.abspath(url), self.root_dir)
    ):
        url = relpath(url, wdir)

    stage = create_stage(
        Stage,
        self,
        fname or path,
        wdir=wdir,
        deps=[url],
        outs=[out],
        erepo=erepo,
    )
    if stage is None:
        return None

    dvcfile = Dvcfile(self, stage.path)
    dvcfile.remove()

    try:
        self.check_modified_graph([stage])
    except OutputDuplicationError as exc:
        # Drop the new stage from the conflict set and chain explicitly so
        # the traceback marks the original error as the direct cause.
        raise OutputDuplicationError(
            exc.output, set(exc.stages) - {stage}
        ) from exc

    if no_exec:
        stage.ignore_outs()
    else:
        stage.run()

    stage.frozen = frozen
    dvcfile.dump(stage)
    return stage
def test_gc_not_collect_pipeline_tracked_files(tmp_dir, dvc, run_copy):
    """``gc`` keeps the cached file until the pipeline file is removed."""
    from dvc.dvcfile import PIPELINE_FILE, Dvcfile

    cache_dir = dvc.cache.local.cache_dir

    tmp_dir.gen("foo", "foo")
    tmp_dir.gen("bar", "bar")
    run_copy("foo", "foo2", name="copy")
    assert _count_files(cache_dir) == 1

    # With the pipeline file in place, gc leaves the cache count unchanged.
    dvc.gc(workspace=True, force=True)
    assert _count_files(cache_dir) == 1

    # After removing the pipeline file, gc empties the cache.
    Dvcfile(dvc, PIPELINE_FILE).remove(force=True)
    dvc.gc(workspace=True, force=True)
    assert _count_files(cache_dir) == 0
def _collect_stages(self):
    """Walk the repo tree and return every stage found in valid DVC files.

    While walking, the ``fspath`` of every ``local``-scheme output is
    remembered, and any sub-directory whose path is one of those outputs is
    removed from ``dirs``.

    Returns:
        list: all stages loaded from the files accepted by
        ``is_valid_filename``, in walk order.
    """
    from dvc.dvcfile import Dvcfile, is_valid_filename

    stages = []
    outs = set()

    for root, dirs, files in self.tree.walk(self.root_dir):
        for file_name in filter(is_valid_filename, files):
            path = os.path.join(root, file_name)
            new_stages = list(Dvcfile(self, path).stages.values())
            stages.extend(new_stages)
            # Only the stages just loaded can contribute new outputs;
            # rescanning everything collected so far made this quadratic.
            outs.update(
                out.fspath
                for stage in new_stages
                for out in stage.outs
                if out.scheme == "local"
            )

        # In-place pruning of ``dirs``; presumably ``self.tree.walk`` honours
        # it the way ``os.walk`` does, so output directories are not
        # descended into -- confirm against the tree implementation.
        dirs[:] = [d for d in dirs if os.path.join(root, d) not in outs]

    return stages
def test_remove_stage_dvcfiles(tmp_dir, dvc, run_copy):
    """``remove_stage`` deletes a single-stage file and tolerates repeats."""
    tmp_dir.gen("foo", "foo")
    stage = run_copy("foo", "bar", single_stage=True)

    single_stage_file = Dvcfile(dvc, stage.path)
    assert single_stage_file.exists()

    single_stage_file.remove_stage(stage)
    assert not single_stage_file.exists()

    # A second removal must not fail even though the stage entry is gone.
    single_stage_file.remove_stage(stage)

    # Nor may it fail once the file itself has been force-removed.
    single_stage_file.remove(force=True)
    single_stage_file.remove_stage(stage)
def test_meta_is_preserved(tmp_dir, dvc):
    """A ``meta`` section survives a load/dump round trip of the DVC file."""
    [stage] = tmp_dir.dvc_gen("foo", "foo content")

    # Add a custom meta section to the DVC file on disk.
    original = load_stage_file(stage.path)
    original["meta"] = {"custom_key": 42}
    dump_stage_file(stage.path, original)

    # Round-trip the stage through Dvcfile.
    round_trip_file = Dvcfile(dvc, stage.path)
    reloaded_stage = round_trip_file.load()
    round_trip_file.dump(reloaded_stage)

    rewritten = load_stage_file(stage.path)
    assert rewritten["meta"] == original["meta"]
def test_parametrization_flag_when_enabled(tmp_dir, dvc, mocker):
    """With the feature flag on, ``DataResolver.resolve`` is called once."""
    dvc.config["feature"]["parametrization"] = True

    resolve_mock = mocker.patch(
        "dvc.dvcfile.DataResolver.resolve",
        return_value=RESOLVED_DVC_YAML_DATA,
    )

    pipeline_file = Dvcfile(dvc, "dvc.yaml")
    # Make ``_load`` return the templated data instead of reading a file.
    mocker.patch.object(
        pipeline_file,
        "_load",
        return_value=[TEMPLATED_DVC_YAML_DATA, None],
    )

    stage_names = list(pipeline_file.stages)

    resolve_mock.assert_called_once()
    assert len(stage_names) == 2
def test_nested(self):
    """Reproducing a stage file located inside another stage's output fails.

    Layout built by the test:

        .
        |-- a
        |   |__ nested            (output of b/b.dvc, declared as ../a/nested)
        |       |__ dir
        |           |__ error.dvc (writes ../../something)
        |__ b
            |__ b.dvc
    """
    stage_dir = "b"
    output_parent = "a"

    os.mkdir(stage_dir)
    os.mkdir(output_parent)

    nested_dir = os.path.join(output_parent, "nested")
    # Path of the output as seen from the stage's wdir, i.e. ../a/nested
    out_dir = relpath(nested_dir, stage_dir)

    nested_stage = self.dvc.run(
        fname=os.path.join(stage_dir, "b.dvc"),
        wdir=stage_dir,
        outs=[out_dir],
        cmd=f"mkdir {out_dir}",
        single_stage=True,
    )

    os.mkdir(os.path.join(nested_dir, "dir"))

    error_stage_path = os.path.join(nested_dir, "dir", "error.dvc")
    output = os.path.join("..", "..", "something")

    dump_yaml(
        error_stage_path,
        {
            "cmd": f"echo something > {output}",
            "outs": [{"path": output}],
        },
    )

    # NOTE: the index is populated explicitly with both stages. The original
    # author's note says error.dvc has to be loaded first and that the walk
    # visits the "a" sub-directories before "b" -- not verifiable from here.
    error_stage = Dvcfile(self.dvc, error_stage_path).stage
    self.dvc.index = self.dvc.index.update([nested_stage, error_stage])

    # ``_reset`` is patched out to prevent `stages` resetting.
    with patch.object(self.dvc, "_reset"), self.assertRaises(
        StagePathAsOutputError
    ):
        self.dvc.reproduce(error_stage_path)
def modify(repo, path, delete=False):
    """Re-verify (or clear) the metric on the output at ``path`` and dump it.

    Exactly one output must match ``path``. With ``delete`` set, the
    output's ``metric`` is cleared before ``verify_metric`` is called.

    Raises:
        DvcException: if the output's scheme is not ``"local"``.
    """
    matches = repo.find_outs_by_path(path)
    assert len(matches) == 1
    out = matches[0]

    if out.scheme != "local":
        raise DvcException(
            "output '{}' scheme '{}' is not supported for metrics".format(
                out.path, out.path_info.scheme
            )
        )

    if delete:
        out.metric = None

    out.verify_metric()

    Dvcfile(repo, out.stage.path).dump(out.stage)
def test_run_multi_stage_repeat(tmp_dir, dvc, run_copy):
    """Only the named runs appear as stages in the pipeline file."""
    from dvc.dvcfile import PIPELINE_FILE, Dvcfile
    from dvc.stage import PipelineStage

    tmp_dir.dvc_gen("foo", "foo")
    run_copy("foo", "foo1", name="copy-foo-foo1")
    run_copy("foo1", "foo2", name="copy-foo1-foo2")
    run_copy("foo2", "foo3", single_stage=True)

    pipeline_stages = [*Dvcfile(dvc, PIPELINE_FILE).stages.values()]

    assert len(pipeline_stages) == 2
    assert all(isinstance(item, PipelineStage) for item in pipeline_stages)

    found_names = {item.name for item in pipeline_stages}
    assert found_names == {
        "copy-foo-foo1",
        "copy-foo1-foo2",
    }
def test_ignored_in_checksum(self):
    """``wdir`` is absent from the dumped stage, and a reload is unchanged."""
    stage = self.dvc.run(
        cmd="echo test > {}".format(self.FOO),
        deps=[self.BAR],
        outs=[self.FOO],
    )

    # The key is missing from the in-memory dump ...
    dumped = stage.dumpd()
    self.assertNotIn(Stage.PARAM_WDIR, dumped.keys())

    # ... and from the stage file on disk.
    on_disk = load_stage_file(stage.relpath)
    self.assertNotIn(Stage.PARAM_WDIR, on_disk.keys())

    with self.dvc.lock, self.dvc.state:
        reloaded = Dvcfile(self.dvc, stage.relpath).load()
        self.assertFalse(reloaded.changed())