def test_frozen(dvc):
    """The frozen key is dumped only for a stage created with ``frozen=True``."""
    plain_stage = create_stage(
        PipelineStage, dvc, outs=["output"], deps=["input"], **kwargs
    )
    dumped = to_pipeline_file(plain_stage)["something"]
    assert plain_stage.PARAM_FROZEN not in dumped

    frozen_stage = create_stage(PipelineStage, dvc, **kwargs, frozen=True)
    dumped = to_pipeline_file(frozen_stage)["something"]
    assert dumped[frozen_stage.PARAM_FROZEN] is True
def test_always_changed(dvc):
    """The always-changed key is dumped only when ``always_changed=True``."""
    default_stage = create_stage(
        PipelineStage, dvc, outs=["output"], deps=["input"], **kwargs
    )
    dumped = to_pipeline_file(default_stage)["something"]
    assert default_stage.PARAM_ALWAYS_CHANGED not in dumped

    flagged_stage = create_stage(
        PipelineStage, dvc, **kwargs, always_changed=True
    )
    dumped = to_pipeline_file(flagged_stage)["something"]
    assert dumped[flagged_stage.PARAM_ALWAYS_CHANGED] is True
def test_fill_from_lock_params(dvc, lock_data):
    """Param deps take their hash values from the lock's ``params`` section."""
    declared_params = [
        "lorem",
        "lorem.ipsum",
        {"myparams.yaml": ["ipsum", "foobar"]},
    ]
    stage = create_stage(
        PipelineStage,
        dvc,
        PIPELINE_FILE,
        deps=["foo"],
        outs=["bar"],
        params=declared_params,
    )

    lock_data["params"] = {
        "params.yaml": {
            "lorem": "lorem",
            "lorem.ipsum": ["i", "p", "s", "u", "m"],
        },
        # The lock intentionally has no value for the `foobar` param.
        "myparams.yaml": {"ipsum": "ipsum"},
    }

    params_deps = split_params_deps(stage)[0]
    default_dep = params_deps[0]
    custom_dep = params_deps[1]

    # Before filling, the deps know their param names but have no hash yet.
    assert set(default_dep.params) == {"lorem", "lorem.ipsum"}
    assert set(custom_dep.params) == {"ipsum", "foobar"}
    assert not default_dep.hash_info
    assert not custom_dep.hash_info

    StageLoader.fill_from_lock(stage, lock_data)

    locked_params = lock_data["params"]
    assert default_dep.hash_info.value == locked_params["params.yaml"]
    assert custom_dep.hash_info.value == locked_params["myparams.yaml"]
def test_params_file_sorted(dvc):
    """Dumped params list plain names first, then per-file dicts, all sorted."""
    unsorted_params = [
        "lorem",
        "ipsum",
        {"custom.yaml": ["wxyz", "pqrs", "baz"]},
        {"a-file-of-params.yaml": ["barr"]},
    ]
    stage = create_stage(
        PipelineStage,
        dvc,
        outs=["bar"],
        deps=["foo"],
        params=unsorted_params,
        **kwargs,
    )

    dumped_params = to_pipeline_file(stage)["something"][stage.PARAM_PARAMS]

    expected = [
        "ipsum",
        "lorem",
        {"a-file-of-params.yaml": ["barr"]},
        {"custom.yaml": ["baz", "pqrs", "wxyz"]},
    ]
    assert dumped_params == expected
def test_lock_outs(dvc, typ):
    """An output declared under ``typ`` lands in the lockfile's ``outs`` list."""
    out_kwargs = {typ: ["input"]}
    stage = create_stage(PipelineStage, dvc, **out_kwargs, **kwargs)
    stage.outs[0].info = {"md5": "md-five"}

    locked = to_single_stage_lockfile(stage)

    expected_out = OrderedDict()
    expected_out["path"] = "input"
    expected_out["md5"] = "md-five"

    expected = OrderedDict()
    expected["cmd"] = "command"
    expected["outs"] = [expected_out]

    assert locked == expected
def _create_stage(self, cache, wdir=None):
    """Build a ``PipelineStage`` from a cache entry and fill it from that entry.

    String params are passed through unchanged. A single-key dict param
    ``{path: [names]}`` is flattened into the ``"path:name1,name2"`` form
    before being handed to ``create_stage``.
    """
    from dvc.stage import create_stage, PipelineStage

    flat_params = []
    for entry in cache.get("params", []):
        if not isinstance(entry, str):
            assert isinstance(entry, dict)
            assert len(entry) == 1
            path = next(iter(entry))
            names = entry[path]
            assert isinstance(names, list)
            entry = f"{path}:" + ",".join(names)
        flat_params.append(entry)

    command = cache["cmd"]
    dep_paths = [dep["path"] for dep in cache.get("deps", [])]
    out_paths = [out["path"] for out in cache["outs"]]

    stage = create_stage(
        PipelineStage,
        repo=self.repo,
        path="dvc.yaml",
        cmd=command,
        wdir=wdir,
        params=flat_params,
        deps=dep_paths,
        outs=out_paths,
    )
    StageLoader.fill_from_lock(stage, cache)
    return stage
def test_order(dvc):
    """Compare the lockfile with an expectation of cmd, deps, params, outs."""
    stage = create_stage(
        PipelineStage,
        dvc,
        deps=["input"],
        outs=["output"],
        params=["foo-param"],
        **kwargs,
    )
    param_deps, file_deps = split_params_deps(stage)

    file_deps[0].info = {"md5": "md-five"}
    param_deps[0].info = {"foo-param": "value"}
    stage.outs[0].info = {"md5": "md5-output"}

    locked = to_single_stage_lockfile(stage)

    expected = OrderedDict()
    expected["cmd"] = "command"
    expected["deps"] = [{"path": "input", "md5": "md-five"}]
    expected["params"] = {"params.yaml": {"foo-param": "value"}}
    expected["outs"] = [{"path": "output", "md5": "md5-output"}]

    assert locked == expected
def _create_stages(repo, targets, fname, pbar=None, external=False):
    """Create one single-output ``Stage`` per target and return the created ones.

    For each created stage, its existing DVC-file is removed (forced) first.
    When ``create_stage`` returns nothing, the target is skipped and
    ``pbar.total`` is reduced by one.
    """
    from dvc.stage import Stage, create_stage

    created = []
    progress = Tqdm(
        targets,
        desc="Creating DVC-files",
        disable=len(targets) < LARGE_DIR_SIZE,
        unit="file",
    )
    for target in progress:
        path, wdir, out = resolve_paths(repo, target)
        stage = create_stage(
            Stage,
            repo,
            fname or path,
            wdir=wdir,
            outs=[out],
            external=external,
        )
        if stage:
            Dvcfile(repo, stage.path).remove_with_prompt(force=True)

        # The repo is reset for every target, whether or not a stage was made.
        repo._reset()

        if stage:
            created.append(stage)
            if pbar is not None:
                pbar.update_msg(out)
        elif pbar is not None:
            pbar.total -= 1

    return created
def run(self, fname=None, no_exec=False, **kwargs):
    """Create a stage from ``kwargs``, optionally run it, and dump it.

    With a ``name`` in ``kwargs`` a ``PipelineStage`` is created in the
    pipeline file; otherwise a single ``Stage`` is created in ``fname`` or
    in the path derived from ``kwargs``.

    Returns:
        The created stage, or ``None`` when ``create_stage`` returns ``None``.

    Raises:
        InvalidStageName: the given name fails ``is_valid_name``.
        DuplicateStageName: the name already exists in the target file.
        OutputDuplicationError: re-raised without the new stage in its set.
    """
    from dvc.stage import PipelineStage, Stage, create_stage
    from dvc.dvcfile import Dvcfile, PIPELINE_FILE

    stage_name = kwargs.get("name")
    if stage_name:
        if not is_valid_name(stage_name):
            raise InvalidStageName
        stage_cls = PipelineStage
        path = PIPELINE_FILE
    else:
        kwargs.pop("name", None)
        stage_cls = Stage
        path = fname or _get_file_path(kwargs)

    stage = create_stage(stage_cls, repo=self, path=path, **kwargs)
    if stage is None:
        return None

    dvcfile = Dvcfile(self, stage.path)
    if dvcfile.exists():
        name_taken = stage_name and stage_name in dvcfile.stages
        if name_taken:
            raise DuplicateStageName(stage_name, dvcfile)
        if stage_cls != PipelineStage:
            overwrite = kwargs.get("overwrite", True)
            dvcfile.remove_with_prompt(force=overwrite)

    try:
        self.check_modified_graph([stage])
    except OutputDuplicationError as exc:
        raise OutputDuplicationError(exc.output, set(exc.stages) - {stage})

    should_execute = not no_exec
    if should_execute:
        stage.run(no_commit=kwargs.get("no_commit", False))

    dvcfile.dump(stage, update_pipeline=True)
    return stage
def test_lock_deps(dvc):
    """A dependency with an md5 appears in the lockfile's ``deps`` list."""
    stage = create_stage(PipelineStage, dvc, deps=["input"], **kwargs)
    stage.deps[0].info = {"md5": "md-five"}

    locked = to_single_stage_lockfile(stage)

    expected_dep = OrderedDict()
    expected_dep["path"] = "input"
    expected_dep["md5"] = "md-five"

    expected = OrderedDict()
    expected["cmd"] = "command"
    expected["deps"] = [expected_dep]

    assert locked == expected
def test_outs_and_outs_flags_are_sorted(dvc, typ, extra):
    """Dumped outputs are ordered by path and their flags by flag name."""
    stage = create_stage(PipelineStage, dvc, deps=["input"], **kwargs)

    # Added deliberately out of order; each entry is (path, loads_from flags).
    out_specs = [
        ("barr", {"use_cache": False}),
        ("foobar", {"use_cache": False, "persist": True}),
        ("foo", {"persist": True}),
        ("bar", {}),
    ]
    for path, flags in out_specs:
        stage.outs += output.loads_from(stage, [path], **flags, **extra)

    serialized_outs = to_pipeline_file(stage)["something"][typ]

    expected = [
        "bar",
        {"barr": {"cache": False}},
        {"foo": {"persist": True}},
        {"foobar": {"cache": False, "persist": True}},
    ]
    assert serialized_outs == expected

    foobar_flags = serialized_outs[3]["foobar"]
    assert list(foobar_flags.keys()) == ["cache", "persist"]
def test_lock_params_file_sorted(dvc):
    """Check the lockfile ``params`` section against a sorted expectation."""
    declared_params = [
        "lorem.ipsum",
        "abc",
        {"myparams.yaml": ["foo", "foobar"]},
        {"a-params-file.yaml": ["bar", "barr"]},
    ]
    stage = create_stage(PipelineStage, dvc, params=declared_params, **kwargs)

    dep_infos = [
        {"lorem.ipsum": {"lorem1": 1, "lorem2": 2}, "abc": 3},
        {"foo": ["f", "o", "o"], "foobar": "foobar"},
        {"bar": ["b", "a", "r"], "barr": "barr"},
    ]
    for index, info in enumerate(dep_infos):
        stage.deps[index].info = info

    locked_params = to_single_stage_lockfile(stage)["params"]

    expected = OrderedDict()
    expected[DEFAULT_PARAMS_FILE] = OrderedDict(
        [("abc", 3), ("lorem.ipsum", {"lorem1": 1, "lorem2": 2})]
    )
    expected["a-params-file.yaml"] = OrderedDict(
        [("bar", ["b", "a", "r"]), ("barr", "barr")]
    )
    expected["myparams.yaml"] = OrderedDict(
        [("foo", ["f", "o", "o"]), ("foobar", "foobar")]
    )

    assert locked_params == expected
def imp_url(
    self,
    url,
    out=None,
    fname=None,
    erepo=None,
    frozen=True,
    no_exec=False,
    desc=None,
    jobs=None,
):
    """Create an import stage with ``url`` as dependency and ``out`` as output.

    The stage's existing DVC-file is removed, the stage is either run or
    (with ``no_exec``) has its outputs ignored, and it is then dumped.

    Returns:
        The dumped stage, or ``None`` when ``stage.can_be_skipped`` is true.

    Raises:
        OutputDuplicationError: re-raised without the new stage in its set.
    """
    from dvc.dvcfile import Dvcfile
    from dvc.stage import Stage, create_stage, restore_meta

    out = resolve_output(url, out)
    path, wdir, out = resolve_paths(self, out)

    # NOTE: when the user imports something from within their own
    # repository, the dependency is stored relative to the stage's wdir.
    url_is_inside_repo = (
        erepo is None
        and os.path.exists(url)
        and path_isin(os.path.abspath(url), self.root_dir)
    )
    if url_is_inside_repo:
        url = relpath(url, wdir)

    stage = create_stage(
        Stage,
        self,
        fname or path,
        wdir=wdir,
        deps=[url],
        outs=[out],
        erepo=erepo,
    )
    restore_meta(stage)
    if stage.can_be_skipped:
        return None

    if desc:
        stage.outs[0].desc = desc

    dvcfile = Dvcfile(self, stage.path)
    dvcfile.remove()

    try:
        self.check_modified_graph([stage])
    except OutputDuplicationError as exc:
        raise OutputDuplicationError(exc.output, set(exc.stages) - {stage})

    if not no_exec:
        stage.run(jobs=jobs)
    else:
        stage.ignore_outs()

    stage.frozen = frozen
    dvcfile.dump(stage)
    return stage
def run(self, fname=None, no_exec=False, single_stage=False, **kwargs):
    """Create a stage from ``kwargs``, run it unless ``no_exec``, and dump it.

    ``single_stage`` selects a ``Stage`` written to ``fname`` (or a path
    derived from ``kwargs``); otherwise a named ``PipelineStage`` is created
    in the pipeline file.

    Returns:
        The created stage, or ``None`` when ``create_stage`` returns ``None``.

    Raises:
        InvalidArgumentError: no ``cmd``, or ``name`` and ``single_stage``
            are combined incorrectly.
        InvalidStageName: the given name fails ``is_valid_name``.
        StageFileAlreadyExistsError: a single-stage file exists and
            ``overwrite`` is falsy.
        DuplicateStageName: the name exists and ``overwrite`` is falsy.
        OutputDuplicationError: re-raised without the new stage in its set.
    """
    from dvc.stage import PipelineStage, Stage, create_stage
    from dvc.dvcfile import Dvcfile, PIPELINE_FILE

    if not kwargs.get("cmd"):
        raise InvalidArgumentError("command is not specified")

    stage_name = kwargs.get("name")
    if stage_name and single_stage:
        raise InvalidArgumentError(
            "`-n|--name` is incompatible with `--single-stage`"
        )
    if not (stage_name or single_stage):
        raise InvalidArgumentError("`-n|--name` is required")

    if single_stage:
        kwargs.pop("name", None)
        stage_cls = Stage
        path = fname or _get_file_path(kwargs)
    else:
        if not is_valid_name(stage_name):
            raise InvalidStageName
        stage_cls = PipelineStage
        path = PIPELINE_FILE

    params = parse_params(kwargs.pop("params", []))
    stage = create_stage(
        stage_cls, repo=self, path=path, params=params, **kwargs
    )
    if stage is None:
        return None

    dvcfile = Dvcfile(self, stage.path)
    if dvcfile.exists():
        # Overwriting wins; the conflict errors apply only without it.
        if kwargs.get("overwrite", True):
            dvcfile.remove_stage(stage)
        elif stage_cls != PipelineStage:
            raise StageFileAlreadyExistsError(dvcfile.relpath)
        elif stage_name and stage_name in dvcfile.stages:
            raise DuplicateStageName(stage_name, dvcfile)

    try:
        self.check_modified_graph([stage])
    except OutputDuplicationError as exc:
        raise OutputDuplicationError(exc.output, set(exc.stages) - {stage})

    if not no_exec:
        stage.run(
            no_commit=kwargs.get("no_commit", False),
            run_cache=kwargs.get("run_cache", True),
        )
    else:
        stage.ignore_outs()

    dvcfile.dump(stage, update_pipeline=True, no_lock=no_exec)
    return stage
def test_fill_from_lock_empty_data(dvc):
    """Filling from ``None`` or ``{}`` leaves dep and out hashes unset."""
    stage = create_stage(
        PipelineStage, dvc, PIPELINE_FILE, deps=["foo"], outs=["bar"]
    )

    for empty_lock in (None, {}):
        StageLoader.fill_from_lock(stage, empty_lock)
        assert not stage.deps[0].hash_info and not stage.outs[0].hash_info
def test_plot_props(dvc):
    """Plot properties set on an output are dumped under the plot's path."""
    plot_props = {"x": "1"}
    stage = create_stage(PipelineStage, dvc, plots=["plot_file"], **kwargs)
    stage.outs[0].plot = plot_props

    dumped_plots = to_pipeline_file(stage)["something"][stage.PARAM_PLOTS]

    assert dumped_plots == [{"plot_file": plot_props}]
def create(
    self,
    single_stage: bool = False,
    validate: bool = True,
    fname: str = None,
    force: bool = False,
    **stage_data,
) -> Union["Stage", "PipelineStage"]:
    """Create a stage object from ``stage_data``.

    Args:
        single_stage: when true, build a ``Stage`` stored at ``fname`` (or at
            a path prepared from ``stage_data``); otherwise build a named
            ``PipelineStage`` in the pipeline file.
        validate: when true, add the stage to the repo index and check the
            resulting graph; unless ``force`` is set, also run
            ``check_stage_exists`` first.
        fname: file to use for a single stage; not used for pipeline stages.
        force: skip the ``check_stage_exists`` step during validation.
        stage_data: keyword data forwarded to ``validate_kwargs`` and then
            to ``create_stage``.

    Returns:
        The new stage, after ``restore_meta`` has been applied to it.

    Raises:
        InvalidStageName: a pipeline stage name is empty or fails
            ``is_valid_name``.
    """
    from dvc.stage import PipelineStage, Stage, create_stage, restore_meta
    from dvc.stage.exceptions import InvalidStageName
    from dvc.stage.utils import (
        check_stage_exists,
        is_valid_name,
        prepare_file_path,
        validate_kwargs,
    )

    stage_data = validate_kwargs(
        single_stage=single_stage, fname=fname, **stage_data
    )

    if not single_stage:
        stage_name = stage_data["name"]
        if not stage_name or not is_valid_name(stage_name):
            raise InvalidStageName
        stage_cls = PipelineStage
        path = PIPELINE_FILE
    else:
        stage_cls = Stage
        path = fname or prepare_file_path(stage_data)

    stage = create_stage(stage_cls, repo=self.repo, path=path, **stage_data)

    if validate:
        if not force:
            check_stage_exists(self.repo, stage, stage.path)

        updated_index = self.repo.index.add(stage)
        updated_index.check_graph()

    restore_meta(stage)
    return stage
def test_wdir(dvc):
    """``wdir`` is dumped only when set to something other than ``os.curdir``."""
    stage = create_stage(PipelineStage, dvc, **kwargs)

    def dumped():
        # Re-serialize on every call so each wdir change is observed.
        return to_pipeline_file(stage)["something"]

    assert stage.PARAM_WDIR not in dumped()

    stage.wdir = os.curdir
    assert stage.PARAM_WDIR not in dumped()

    stage.wdir = "some-dir"
    assert dumped()[stage.PARAM_WDIR] == "some-dir"
def test_order_deps_outs(dvc, typ):
    """With ``typ`` left out, the remaining sections follow ``cmd`` in order."""
    sections = ("deps", "params", "outs", "metrics", "plots")
    remaining = [section for section in sections if section != typ]

    extra = {}
    for index, section in enumerate(remaining):
        extra[section] = [f"foo-{index}"]

    stage = create_stage(PipelineStage, dvc, **kwargs, **extra)

    assert typ not in to_pipeline_file(stage)["something"]

    dumped_keys = list(to_pipeline_file(stage)["something"].keys())
    assert dumped_keys == ["cmd"] + remaining
def make(path="dvc.yaml", name="dummy_stage", **kwargs):
    """Create a ``PipelineStage``, dump it, and return it.

    Args:
        path: stage file path passed to ``create_stage``.
        name: stage name passed to ``create_stage``.
        **kwargs: extra keyword arguments forwarded to ``create_stage``.
            An optional ``cmd`` entry overrides the default ``"command"``.

    Returns:
        The dumped stage.
    """
    from dvc.stage import PipelineStage, create_stage

    # Pop rather than get: leaving "cmd" in kwargs would pass it twice below
    # and raise TypeError (multiple values for keyword argument 'cmd').
    cmd = kwargs.pop("cmd", "command")
    stage = create_stage(
        PipelineStage, dvc, path, name=name, cmd=cmd, **kwargs
    )
    stage.dump()
    return stage
def test_deps_sorted(dvc):
    """Dependencies are dumped in sorted order regardless of input order."""
    unsorted_deps = ["a", "quick", "lazy", "fox"]
    stage = create_stage(PipelineStage, dvc, deps=unsorted_deps, **kwargs)

    dumped_deps = to_pipeline_file(stage)["something"][stage.PARAM_DEPS]

    assert dumped_deps == ["a", "fox", "lazy", "quick"]
def make(path="dvc.yaml", name="dummy_stage", **kwargs):
    """Create a ``PipelineStage`` with an empty command, dump it, return it."""
    from dvc.stage import PipelineStage, create_stage

    new_stage = create_stage(
        PipelineStage,
        dvc,
        path,
        name=name,
        cmd="",
        **kwargs,
    )
    new_stage.dump()
    return new_stage
def _create_stages(
    repo,
    targets,
    fname,
    pbar=None,
    external=False,
    glob=False,
    desc=None,
):
    """Create one single-output ``Stage`` per target and return the created ones.

    With ``glob`` set, every target is expanded as a recursive glob pattern
    first. For each created stage its existing DVC-file is removed and, if
    given, ``desc`` is set on its first output. Targets for which no stage
    is created reduce ``pbar.total`` by one.
    """
    from glob import iglob

    from dvc.stage import Stage, create_stage

    if glob:
        expanded_targets = []
        for pattern in targets:
            expanded_targets.extend(iglob(pattern, recursive=True))
    else:
        expanded_targets = targets

    created = []
    progress = Tqdm(
        expanded_targets,
        desc="Creating DVC-files",
        disable=len(expanded_targets) < LARGE_DIR_SIZE,
        unit="file",
    )
    for target in progress:
        path, wdir, out = resolve_paths(repo, target)
        stage = create_stage(
            Stage,
            repo,
            fname or path,
            wdir=wdir,
            outs=[out],
            external=external,
        )
        if stage:
            Dvcfile(repo, stage.path).remove()
            if desc:
                stage.outs[0].desc = desc

        # The repo is reset for every target, whether or not a stage was made.
        repo._reset()  # pylint: disable=protected-access

        if stage:
            created.append(stage)
            if pbar is not None:
                pbar.update_msg(out)
        elif pbar is not None:
            pbar.total -= 1

    return created
def _create_stages(
    repo,
    targets,
    fname,
    pbar=None,
    external=False,
    glob=False,
    desc=None,
    transfer=False,
    **kwargs,
):
    """Create one single-output ``Stage`` per expanded target.

    Targets go through ``glob_targets``. When ``kwargs`` has a truthy
    ``out``, each target is first resolved against it. Every stage gets
    ``restore_meta`` applied, its existing DVC file removed and, if given,
    ``desc`` set on its first output.

    Returns:
        The list of created stages.
    """
    from dvc.dvcfile import Dvcfile
    from dvc.stage import Stage, create_stage, restore_meta

    expanded_targets = glob_targets(targets, glob=glob)

    # Neither value changes between iterations, so compute them once.
    out_override = kwargs.get("out")
    always_local = transfer and not out_override

    created = []
    progress = Tqdm(
        expanded_targets,
        desc="Creating DVC files",
        disable=len(expanded_targets) < LARGE_DIR_SIZE,
        unit="file",
    )
    for target in progress:
        if out_override:
            target = resolve_output(target, out_override)
        path, wdir, out = resolve_paths(
            repo, target, always_local=always_local
        )
        stage = create_stage(
            Stage,
            repo,
            fname or path,
            wdir=wdir,
            outs=[out],
            external=external,
        )
        restore_meta(stage)
        Dvcfile(repo, stage.path).remove()
        if desc:
            stage.outs[0].desc = desc

        repo._reset()  # pylint: disable=protected-access

        # NOTE(review): `stage.path` was already read above, so this branch
        # looks unreachable unless a stage object can be falsy - confirm.
        if not stage:
            if pbar is not None:
                pbar.total -= 1
            continue

        created.append(stage)
        if pbar is not None:
            pbar.update_msg(out)

    return created
def test_fill_from_lock_missing_params_section(dvc, lock_data):
    """After filling from ``lock_data`` here, param deps have no hash info."""
    declared_params = ["lorem", "lorem.ipsum", {"myparams.yaml": ["ipsum"]}]
    stage = create_stage(
        PipelineStage,
        dvc,
        PIPELINE_FILE,
        deps=["foo"],
        outs=["bar"],
        params=declared_params,
    )
    params_deps = split_params_deps(stage)[0]

    StageLoader.fill_from_lock(stage, lock_data)

    first_dep = params_deps[0]
    second_dep = params_deps[1]
    assert not first_dep.hash_info and not second_dep.hash_info
def test_lock_params(dvc):
    """Check the default params file's lockfile entry against sorted keys."""
    stage = create_stage(
        PipelineStage, dvc, params=["lorem.ipsum", "abc"], **kwargs
    )
    stage.deps[0].info = {"lorem.ipsum": {"lorem1": 1, "lorem2": 2}, "abc": 3}

    locked_params = to_single_stage_lockfile(stage)["params"]

    expected = OrderedDict()
    expected["abc"] = 3
    expected["lorem.ipsum"] = {"lorem1": 1, "lorem2": 2}

    assert locked_params[DEFAULT_PARAMS_FILE] == expected
def test_fill_from_lock_use_appropriate_checksum(dvc, lock_data):
    """An S3 dependency locked with an etag is filled with an etag hash."""
    s3_path = "s3://dvc-temp/foo"
    stage = create_stage(
        PipelineStage,
        dvc,
        PIPELINE_FILE,
        deps=[s3_path],
        outs=["bar"],
    )
    lock_data["deps"] = [{"path": "s3://dvc-temp/foo", "etag": "e-tag"}]

    StageLoader.fill_from_lock(stage, lock_data)

    dep_hash = stage.deps[0].hash_info
    assert dep_hash == HashInfo("etag", "e-tag")

    # The expected out hash presumably comes from the `lock_data` fixture.
    out_hash = stage.outs[0].hash_info
    assert out_hash == HashInfo("md5", "bar_checksum")
def test_fill_from_lock_deps_outs(dvc, lock_data):
    """Deps and outs start without hashes and get md5 hashes from the lock."""
    stage = create_stage(
        PipelineStage, dvc, PIPELINE_FILE, deps=["foo"], outs=["bar"]
    )
    assert not any(item.hash_info for item in chain(stage.deps, stage.outs))

    StageLoader.fill_from_lock(stage, lock_data)

    # The expected checksums presumably come from the `lock_data` fixture.
    expected_dep_hash = HashInfo("md5", "foo_checksum")
    expected_out_hash = HashInfo("md5", "bar_checksum")
    assert stage.deps[0].hash_info == expected_dep_hash
    assert stage.outs[0].hash_info == expected_out_hash
def test_unhashable(tmp_dir, dvc, mocker, kwargs):
    """Saving or restoring the stage built from ``kwargs`` never hashes it.

    ``cache.save`` must return ``None`` and ``cache.restore`` must raise
    ``RunCacheNotFoundError``, both without calling
    ``dvc.stage.cache._get_stage_hash``.
    """
    from dvc.stage import Stage, create_stage
    from dvc.stage.cache import RunCacheNotFoundError, StageCache

    cache = StageCache(dvc)
    stage = create_stage(Stage, path="stage.dvc", repo=dvc, **kwargs)
    get_stage_hash = mocker.patch("dvc.stage.cache._get_stage_hash")

    assert cache.save(stage) is None
    # `mock.not_called` is just an auto-created child mock and always truthy,
    # so asserting on it checks nothing; use the real assertion helper.
    get_stage_hash.assert_not_called()

    with pytest.raises(RunCacheNotFoundError):
        cache.restore(stage)
    get_stage_hash.assert_not_called()
def test_dump_nondefault_hash(dvc):
    """An S3 dependency given an md5 is written to the lockfile with it."""
    s3_path = "s3://dvc-temp/file"
    stage = create_stage(PipelineStage, dvc, deps=[s3_path], **kwargs)
    stage.deps[0].info = {"md5": "value"}

    locked = to_single_stage_lockfile(stage)

    expected_dep = OrderedDict()
    expected_dep["path"] = "s3://dvc-temp/file"
    expected_dep["md5"] = "value"

    expected = OrderedDict()
    expected["cmd"] = "command"
    expected["deps"] = [expected_dep]

    assert locked == expected