def test_order(dvc):
    """Check that the lockfile sections are emitted as cmd, deps, params, outs."""
    pipeline_stage = create_stage(
        PipelineStage,
        dvc,
        deps=["input"],
        outs=["output"],
        params=["foo-param"],
        **kwargs,
    )
    param_deps, file_deps = split_params_deps(pipeline_stage)

    # Give every entry a hash so that each section shows up in the lockfile.
    file_deps[0].hash_info = HashInfo("md5", "md-five")
    param_deps[0].hash_info = HashInfo("params", {"foo-param": "value"})
    pipeline_stage.outs[0].hash_info = HashInfo("md5", "md5-output")

    expected = OrderedDict(
        [
            ("cmd", "command"),
            ("deps", [{"path": "input", "md5": "md-five"}]),
            ("params", {"params.yaml": {"foo-param": "value"}}),
            ("outs", [{"path": "output", "md5": "md5-output"}]),
        ]
    )
    assert to_single_stage_lockfile(pipeline_stage) == expected
def test_get_hash_dirty_file(tmp_dir, dvc):
    """Stage a tracked file that was modified, then removed, in the workspace."""
    from dvc_data import check
    from dvc_data.hashfile.hash import hash_file

    tmp_dir.dvc_gen("file", "file")
    tracked_hash = HashInfo("md5", "8c7dd922ad47494fc02c388e12c00eac")

    workspace_file = tmp_dir / "file"
    workspace_file.write_text("something")
    modified_hash = HashInfo("md5", "437b930db84b8079c2dd804a71936b5f")

    # The workspace copy is dirty: the filesystem reports no md5 for it and
    # staging has to produce the hash of the modified content.
    fs = DvcFileSystem(repo=dvc)
    assert fs.info("file").get("md5") is None
    staging, _meta, obj = stage(dvc.odb.local, "file", fs, "md5")
    assert obj.hash_info == modified_hash
    check(staging, obj)

    # With the workspace copy gone, the DVC-tracked hash is reported instead.
    workspace_file.unlink()
    assert fs.info("file")["md5"] == tracked_hash.value
    _, computed_hash = hash_file("file", fs, "md5", state=dvc.state)
    assert computed_hash == tracked_hash

    # The file can still be staged although it is missing from the workspace,
    # and the resulting object carries the tracked hash.
    _staging, _meta, obj = stage(dvc.odb.local, "file", fs, "md5")
    assert obj.hash_info == tracked_hash
def test_used_objs(tmp_dir, scm, dvc, run_copy, rev):
    """Check Index.used_objs() for the whole index and for narrowed targets."""
    from dvc_data.hashfile.hash_info import HashInfo

    dvc.config["core"]["autostage"] = True
    tmp_dir.dvc_gen({"dir": {"subdir": {"file": "file"}}, "foo": "foo"})
    run_copy("foo", "bar", name="copy-foo-bar")
    scm.commit("commit")

    index = get_index(dvc, rev)

    bar_obj = HashInfo(
        name="md5",
        value="acbd18db4cc2f85cedef654fccc4a4d8",
        obj_name="bar",
    )
    nested_file_obj = HashInfo(
        name="md5",
        value="8c7dd922ad47494fc02c388e12c00eac",
        obj_name="dir/subdir/file",
    )
    dir_obj = HashInfo(
        name="md5",
        value="d28c9e28591aeb7e303dc6772ffa6f6b.dir",
        obj_name="dir",
    )
    dir_only = {nested_file_obj, dir_obj}
    everything = {bar_obj} | dir_only

    assert index.used_objs() == {None: everything}
    assert index.used_objs("dir") == {None: dir_only}
    assert index.used_objs(".", recursive=True) == {None: everything}
    assert index.used_objs("copy-foo-bar", with_deps=True) == {None: {bar_obj}}
def test_fill_from_lock_use_appropriate_checksum(dvc, lock_data):
    """A dep locked with an ``etag`` entry must load as an etag HashInfo."""
    s3_url = "s3://dvc-temp/foo"
    pipeline_stage = create_stage(
        PipelineStage,
        dvc,
        PIPELINE_FILE,
        deps=[s3_url],
        outs=["bar"],
    )
    lock_data["deps"] = [{"path": s3_url, "etag": "e-tag"}]

    StageLoader.fill_from_lock(pipeline_stage, lock_data)

    dep = pipeline_stage.deps[0]
    assert dep.hash_info == HashInfo("etag", "e-tag")
    out = pipeline_stage.outs[0]
    assert out.hash_info == HashInfo("md5", "bar_checksum")
def test_load_stage(dvc, stage_data, lock_data):
    """Load "stage-1" and verify its basic attributes and locked hashes."""
    pipeline_file = Dvcfile(dvc, PIPELINE_FILE)
    loaded = StageLoader.load_stage(
        pipeline_file, "stage-1", stage_data, lock_data
    )

    assert loaded.wdir == os.path.abspath(os.curdir)
    assert loaded.name == "stage-1"
    assert loaded.cmd == "command"
    assert loaded.path == os.path.abspath(PIPELINE_FILE)

    dep = loaded.deps[0]
    assert dep.def_path == "foo"
    assert dep.hash_info == HashInfo("md5", "foo_checksum")

    out = loaded.outs[0]
    assert out.def_path == "bar"
    assert out.hash_info == HashInfo("md5", "bar_checksum")
def test_load_stage_with_params(dvc, stage_data, lock_data):
    """Load a stage that declares a param and verify params, deps and outs."""
    lock_data["params"] = {"params.yaml": {"lorem": "ipsum"}}
    stage_data["params"] = ["lorem"]

    pipeline_file = Dvcfile(dvc, PIPELINE_FILE)
    loaded = StageLoader.load_stage(
        pipeline_file, "stage-1", stage_data, lock_data
    )
    param_deps, file_deps = split_params_deps(loaded)

    assert file_deps[0].def_path == "foo"
    assert loaded.outs[0].def_path == "bar"
    assert param_deps[0].def_path == "params.yaml"

    assert param_deps[0].hash_info == HashInfo("params", {"lorem": "ipsum"})
    assert file_deps[0].hash_info == HashInfo("md5", "foo_checksum")
    assert loaded.outs[0].hash_info == HashInfo("md5", "bar_checksum")
def test_used_objs(tmp_dir, dvc, path):
    """Collect every object id returned by Repo.used_objs() for ``path``."""
    tmp_dir.dvc_gen({"dir": {"subdir": {"file": "file"}, "other": "other"}})

    expected = {
        HashInfo("md5", "70922d6bf66eb073053a82f77d58c536.dir"),
        HashInfo("md5", "8c7dd922ad47494fc02c388e12c00eac"),
    }

    # Flatten the per-key groups into one set; the keys are irrelevant here.
    collected = set().union(*dvc.used_objs([path]).values())
    assert collected == expected
def test_as_raw():
    """as_raw() drops the ``.dir`` suffix on a copy and leaves the source intact."""
    dir_value = "a1d0c6e83f027327d8461063f4ac58a6.dir"
    original = HashInfo("md5", dir_value, "objname")

    raw = original.as_raw()

    # The source object must be unchanged by the call.
    assert (original.name, original.value, original.obj_name) == (
        "md5",
        dir_value,
        "objname",
    )
    assert (raw.name, raw.value, raw.obj_name) == (
        "md5",
        "a1d0c6e83f027327d8461063f4ac58a6",
        "objname",
    )
def test_fill_from_lock_deps_outs(dvc, lock_data):
    """Deps and outs start without hashes and receive them from the lock data."""
    pipeline_stage = create_stage(
        PipelineStage, dvc, PIPELINE_FILE, deps=["foo"], outs=["bar"]
    )

    # Nothing is filled in before the lock data is applied.
    assert not any(
        item.hash_info
        for item in chain(pipeline_stage.deps, pipeline_stage.outs)
    )

    StageLoader.fill_from_lock(pipeline_stage, lock_data)

    assert pipeline_stage.deps[0].hash_info == HashInfo("md5", "foo_checksum")
    assert pipeline_stage.outs[0].hash_info == HashInfo("md5", "bar_checksum")
def test_fill_from_lock_missing_checksums(dvc, lock_data):
    """Entries absent from the lock data must be left without a hash."""
    pipeline_stage = create_stage(
        PipelineStage,
        dvc,
        PIPELINE_FILE,
        deps=["foo", "foo1"],
        outs=["bar", "bar1"],
    )

    StageLoader.fill_from_lock(pipeline_stage, lock_data)

    locked_dep, unlocked_dep = pipeline_stage.deps[0], pipeline_stage.deps[1]
    locked_out, unlocked_out = pipeline_stage.outs[0], pipeline_stage.outs[1]

    assert locked_dep.hash_info == HashInfo("md5", "foo_checksum")
    assert locked_out.hash_info == HashInfo("md5", "bar_checksum")
    assert not unlocked_dep.hash_info
    assert not unlocked_out.hash_info
def test_cache_load_bad_dir_cache(tmp_dir, dvc):
    """Loading a malformed ``.dir`` cache entry must raise ObjectFormatError."""
    from dvc_data import load

    # (object id, file content) pairs: first non-JSON text, then JSON that
    # is a mapping.
    bad_entries = (
        ("123.dir", "<clearly>not,json"),
        ("234.dir", '{"a": "b"}'),
    )
    for dir_hash, payload in bad_entries:
        cache_path = os.fspath(dvc.odb.local.oid_to_path(dir_hash))
        tmp_dir.gen({cache_path: payload})
        with pytest.raises(ObjectFormatError):
            load(dvc.odb.local, HashInfo("md5", dir_hash))
def test_lock_outs_order(dvc, typ):
    """Outs declared as input1, input0 must be locked in path order."""
    pipeline_stage = create_stage(
        PipelineStage, dvc, **{typ: ["input1", "input0"]}, **kwargs
    )
    first_out, second_out = pipeline_stage.outs[0], pipeline_stage.outs[1]
    first_out.hash_info = HashInfo("md5", "md-one1")
    second_out.hash_info = HashInfo("md5", "md-zer0")

    expected_outs = [
        OrderedDict([("path", out_path), ("md5", checksum)])
        for out_path, checksum in (
            ("input0", "md-zer0"),
            ("input1", "md-one1"),
        )
    ]
    expected = OrderedDict([("cmd", "command"), ("outs", expected_outs)])
    assert to_single_stage_lockfile(pipeline_stage) == expected
def test_get_hash_granular(tmp_dir, dvc):
    """Dry-run stage a subdirectory and a file nested inside a tracked dir."""
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    )
    fs = DataFileSystem(repo=dvc)

    # The subdirectory reports no md5 through fs.info(), yet staging it
    # yields a ``.dir`` hash.
    subdir = "dir/subdir"
    assert fs.info(subdir).get("md5") is None
    _staging, _meta, obj = stage(
        dvc.odb.local, subdir, fs, "md5", dry_run=True
    )
    assert obj.hash_info == HashInfo(
        "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir"
    )

    # The nested file reports its md5 directly and stages to the same value.
    data_md5 = "8d777f385d3dfec8815d20f7496026dc"
    data_path = posixpath.join(subdir, "data")
    assert fs.info(data_path)["md5"] == data_md5
    _staging, _meta, obj = stage(
        dvc.odb.local, data_path, fs, "md5", dry_run=True
    )
    assert obj.hash_info == HashInfo("md5", data_md5)
def fill_from_lock(stage, lock_data=None):
    """Populate a stage's deps, outs and params from its lockfile entry.

    Does nothing when ``lock_data`` is empty or ``None``.  Params
    dependencies receive their values via ``fill_values``; every other
    dep/out gets ``meta`` and ``hash_info`` rebuilt from the lock entry
    that matches its ``def_path`` (an empty entry when none matches).
    """
    if not lock_data:
        return

    assert isinstance(lock_data, dict)

    sections = (
        (StageParams.PARAM_DEPS, stage.deps),
        (StageParams.PARAM_OUTS, stage.outs),
    )

    # Index each lock section by path so entries can be looked up directly.
    locked = {
        section: {entry["path"]: entry for entry in lock_data.get(section, {})}
        for section in (StageParams.PARAM_DEPS, StageParams.PARAM_OUTS)
    }

    for section, members in sections:
        for member in members:
            path = member.def_path

            if isinstance(member, dependency.ParamsDependency):
                locked_values = get_in(lock_data, [stage.PARAM_PARAMS, path])
                member.fill_values(locked_values)
                continue

            # Copy before stripping "path" so the lock data stays untouched.
            entry = get_in(locked, [section, path], {}).copy()
            entry.pop("path", None)
            member.meta = Meta.from_dict(entry)
            member.hash_info = HashInfo.from_dict(entry)
def test_load_stage_with_metrics_and_plots(dvc, stage_data, lock_data, typ):
    """Outs declared under the ``typ`` key must still load with their hash."""
    # Move the outs definition under the key being tested.
    stage_data[typ] = stage_data.pop("outs")

    pipeline_file = Dvcfile(dvc, PIPELINE_FILE)
    loaded = StageLoader.load_stage(
        pipeline_file, "stage-1", stage_data, lock_data
    )

    out = loaded.outs[0]
    assert out.def_path == "bar"
    assert out.hash_info == HashInfo("md5", "bar_checksum")
def test_lock_deps(dvc):
    """A single hashed dep is serialized as an ordered path/md5 entry."""
    pipeline_stage = create_stage(
        PipelineStage, dvc, deps=["input"], **kwargs
    )
    pipeline_stage.deps[0].hash_info = HashInfo("md5", "md-five")

    dep_entry = OrderedDict([("path", "input"), ("md5", "md-five")])
    expected = OrderedDict([("cmd", "command"), ("deps", [dep_entry])])
    assert to_single_stage_lockfile(pipeline_stage) == expected
def test_lock_params_file_sorted(dvc):
    """Check the order of params files and of their keys in the lockfile."""
    pipeline_stage = create_stage(
        PipelineStage,
        dvc,
        params=[
            "lorem.ipsum",
            "abc",
            {"myparams.yaml": ["foo", "foobar"]},
            {"a-params-file.yaml": ["bar", "barr"]},
        ],
        **kwargs,
    )

    # One values mapping per dependency, in the order the deps were created.
    locked_values = [
        {"lorem.ipsum": {"lorem1": 1, "lorem2": 2}, "abc": 3},
        {"foo": ["f", "o", "o"], "foobar": "foobar"},
        {"bar": ["b", "a", "r"], "barr": "barr"},
    ]
    for position, values in enumerate(locked_values):
        pipeline_stage.deps[position].hash_info = HashInfo("params", values)

    default_file_params = OrderedDict(
        [("abc", 3), ("lorem.ipsum", {"lorem1": 1, "lorem2": 2})]
    )
    a_file_params = OrderedDict(
        [("bar", ["b", "a", "r"]), ("barr", "barr")]
    )
    my_file_params = OrderedDict(
        [("foo", ["f", "o", "o"]), ("foobar", "foobar")]
    )
    expected = OrderedDict(
        [
            (DEFAULT_PARAMS_FILE, default_file_params),
            ("a-params-file.yaml", a_file_params),
            ("myparams.yaml", my_file_params),
        ]
    )
    assert to_single_stage_lockfile(pipeline_stage)["params"] == expected
def test_fill_from_lock_with_missing_sections(dvc, lock_data):
    """A lock entry lacking ``deps`` or ``outs`` clears only that side."""
    pipeline_stage = create_stage(
        PipelineStage, dvc, PIPELINE_FILE, deps=["foo"], outs=["bar"]
    )

    # Without a "deps" section only the out receives a hash.
    lock_without_deps = deepcopy(lock_data)
    lock_without_deps.pop("deps")
    StageLoader.fill_from_lock(pipeline_stage, lock_without_deps)
    assert not pipeline_stage.deps[0].hash_info
    assert pipeline_stage.outs[0].hash_info == HashInfo("md5", "bar_checksum")

    # Without an "outs" section the dep is filled and the out hash is reset.
    lock_without_outs = deepcopy(lock_data)
    lock_without_outs.pop("outs")
    StageLoader.fill_from_lock(pipeline_stage, lock_without_outs)
    assert pipeline_stage.deps[0].hash_info == HashInfo("md5", "foo_checksum")
    assert not pipeline_stage.outs[0].hash_info
def test_get_hash_dirty_file(tmp_dir, dvc):
    """DataFileSystem keeps reporting one md5 after the workspace file changes."""
    tmp_dir.dvc_gen("file", "file")
    workspace_file = tmp_dir / "file"
    workspace_file.write_text("something")

    tracked_md5 = "8c7dd922ad47494fc02c388e12c00eac"
    fs = DataFileSystem(repo=dvc)
    assert fs.info("file").get("md5") == tracked_md5

    _staging, _meta, obj = stage(
        dvc.odb.local, "file", fs, "md5", dry_run=True
    )
    assert obj.hash_info == HashInfo("md5", tracked_md5)
def test_get_hash_dirty_dir(tmp_dir, dvc):
    """DataFileSystem hash of a dir after an extra file is written into it."""
    tmp_dir.dvc_gen({"dir": {"foo": "foo", "bar": "bar"}})
    extra_file = tmp_dir / "dir" / "baz"
    extra_file.write_text("baz")

    tracked_md5 = "5ea40360f5b4ec688df672a4db9c17d1.dir"
    fs = DataFileSystem(repo=dvc)
    assert fs.info("dir").get("md5") == tracked_md5

    _staging, _meta, obj = stage(
        dvc.odb.local, "dir", fs, "md5", dry_run=True
    )
    assert obj.hash_info == HashInfo("md5", tracked_md5)
def test_get_hash_cached_file(tmp_dir, dvc, mocker):
    """fs.info() exposes the md5 only once the workspace copy is removed."""
    tmp_dir.dvc_gen({"foo": "foo"})
    fs = DvcFileSystem(repo=dvc)
    foo_md5 = "acbd18db4cc2f85cedef654fccc4a4d8"

    # While the file exists in the workspace no md5 is reported by info().
    assert fs.info("foo").get("md5") is None
    _staging, _meta, obj = stage(dvc.odb.local, "foo", fs, "md5")
    assert obj.hash_info == HashInfo("md5", foo_md5)

    (tmp_dir / "foo").unlink()
    assert fs.info("foo")["md5"] == foo_md5
def test_get_hash_cached_dir(tmp_dir, dvc, mocker):
    """Stage a tracked directory before and after deleting it from the workspace."""
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    )
    fs = DvcFileSystem(repo=dvc)
    dir_md5 = "8761c4e9acad696bee718615e23e22db.dir"
    dir_hash = HashInfo("md5", dir_md5)

    # With the directory present, info() reports no md5 but staging works.
    assert fs.info("dir").get("md5") is None
    _staging, _meta, obj = stage(dvc.odb.local, "dir", fs, "md5")
    assert obj.hash_info == dir_hash

    # After removal, info() reports the md5 and staging gives the same hash.
    shutil.rmtree(tmp_dir / "dir")
    assert fs.info("dir")["md5"] == dir_md5
    _staging, _meta, obj = stage(dvc.odb.local, "dir", fs, "md5")
    assert obj.hash_info == dir_hash
def __init__(
    self,
    stage,
    path,
    info=None,
    cache=True,
    metric=False,
    plot=False,
    persist=False,
    checkpoint=False,
    live=False,
    desc=None,
    remote=None,
    repo=None,
):
    """Set up the filesystem, paths, hash metadata and flags of the entry.

    ``repo`` takes precedence when given; otherwise the stage's repo is
    used (if there is a stage).  ``info`` feeds both ``meta`` and
    ``hash_info``.  ``cache``, ``metric`` and ``plot`` are ignored (stored
    as ``False``) when the class has ``IS_DEPENDENCY`` set.
    """
    self.repo = stage.repo if not repo and stage else repo

    fs_cls, fs_config, fs_path = get_cloud_fs(self.repo, url=path)
    self.fs = fs_cls(**fs_config)

    # A local path located inside the stage's repo is stored relative to
    # the stage's working directory and served by the repo's filesystem.
    inside_stage_repo = (
        self.fs.protocol == "local"
        and stage
        and isinstance(stage.repo.fs, LocalFileSystem)
        and path_isin(path, stage.repo.root_dir)
    )
    if inside_stage_repo:
        self.def_path = relpath(path, stage.wdir)
        self.fs = stage.repo.fs
    else:
        self.def_path = path

    # Relative local paths are always handled through the repo's filesystem.
    use_repo_fs = (
        self.repo
        and self.fs.protocol == "local"
        and not self.fs.path.isabs(self.def_path)
    )
    if use_repo_fs:
        self.fs = self.repo.fs

    self._validate_output_path(path, stage)

    # Path-like attributes carried by an output/dependency:
    #
    #   .def_path - the path as written in the DVC file
    #   .fspath   - local only, resolved
    #   .__str__  - presentation form, def_path/relpath
    #
    # A resolved path holds the actual location: it should be absolute
    # and must not contain remote:// references.
    self.stage = stage
    self.meta = Meta.from_dict(info)
    self.hash_info = HashInfo.from_dict(info)

    # Dependencies never use the cache and are never metrics or plots.
    if self.IS_DEPENDENCY:
        self.use_cache = False
        self.metric = False
        self.plot = False
    else:
        self.use_cache = cache
        self.metric = metric
        self.plot = plot

    self.persist = persist
    self.checkpoint = checkpoint
    self.live = live
    self.desc = desc

    self.fs_path = self._parse_path(self.fs, fs_path)
    self.obj = None

    self.remote = remote
def test_state(tmp_dir, dvc):
    """A saved hash is returned until the file is replaced, then must be re-saved."""
    tmp_dir.gen("foo", "foo content")
    target = tmp_dir / "foo"

    first_hash = HashInfo("md5", file_md5(target, dvc.fs))
    state = State(dvc.root_dir, dvc.tmp_dir, dvc.dvcignore)
    state.save(target, dvc.fs, first_hash)
    assert state.get(target, dvc.fs)[1] == first_hash

    # Recreate the file with different content: the stored entry no longer
    # applies.
    target.unlink()
    target.write_text("1")
    assert state.get(target, dvc.fs) == (None, None)

    second_hash = HashInfo("md5", file_md5(target, dvc.fs))
    state.save(target, dvc.fs, second_hash)
    assert state.get(target, dvc.fs)[1] == second_hash
def test_get_hash_cached_granular(tmp_dir, dvc, mocker):
    """Stage a nested subdir and file, then delete the file and query its md5."""
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    )
    fs = DvcFileSystem(repo=dvc)

    subdir = "dir/subdir"
    assert fs.info(subdir).get("md5") is None
    _staging, _meta, obj = stage(dvc.odb.local, subdir, fs, "md5")
    assert obj.hash_info == HashInfo(
        "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir"
    )

    data_md5 = "8d777f385d3dfec8815d20f7496026dc"
    data_path = posixpath.join(subdir, "data")
    assert fs.info(data_path).get("md5") is None
    _staging, _meta, obj = stage(dvc.odb.local, data_path, fs, "md5")
    assert obj.hash_info == HashInfo("md5", data_md5)

    # Once the workspace file is gone, info() reports the md5.
    (tmp_dir / "dir" / "subdir" / "data").unlink()
    assert fs.info(data_path)["md5"] == data_md5
def test_get_hash_dirty_dir(tmp_dir, dvc):
    """Staging a dir through DvcFileSystem counts the extra workspace file."""
    tmp_dir.dvc_gen({"dir": {"foo": "foo", "bar": "bar"}})
    extra_file = tmp_dir / "dir" / "baz"
    extra_file.write_text("baz")

    fs = DvcFileSystem(repo=dvc)
    _staging, meta, obj = stage(dvc.odb.local, "dir", fs, "md5")

    expected_hash = HashInfo("md5", "ba75a2162ca9c29acecb7957105a0bc2.dir")
    assert obj.hash_info == expected_hash
    # foo and bar from dvc_gen plus the baz written afterwards.
    assert meta.nfiles == 3
def get_hash(self):
    """Read the current parameter values and wrap them in a HashInfo.

    Returns:
        ``HashInfo(self.PARAM_PARAMS, info)`` where ``info`` is whatever
        ``self.read_params()`` returned.

    Raises:
        MissingParamsError: if any name in ``self.params`` is not a key of
            the values that were read.
    """
    info = self.read_params()

    missing_params = set(self.params) - set(info.keys())
    if missing_params:
        # Sets iterate in arbitrary order; sort the names so the message
        # is the same on every run.
        raise MissingParamsError(
            "Parameters '{}' are missing from '{}'.".format(
                ", ".join(sorted(missing_params)), self
            )
        )

    return HashInfo(self.PARAM_PARAMS, info)
def test_status_download_optimization(mocker, dvc):
    """Skip remote existence checks for objects already in the local cache.

    When the status is compared in order to pull from a remote cache and
    every requested object is already present locally, the remote must
    not be asked whether those objects exist.
    """
    from dvc_data.status import compare_status

    local_odb = LocalHashFileDB(LocalFileSystem(), os.getcwd())
    wanted = {
        HashInfo("md5", "acbd18db4cc2f85cedef654fccc4a4d8"),
        HashInfo("md5", "37b51d194a7513e45b56f6524f2d51f2"),
    }

    # Pretend that every requested object id is present locally.
    present_locally = [obj_id.value for obj_id in wanted]
    mocker.patch.object(
        local_odb, "oids_exist", return_value=present_locally
    )

    remote_odb = mocker.Mock()
    compare_status(remote_odb, local_odb, wanted, check_deleted=False)

    assert remote_odb.oids_exist.call_count == 0
def v1_repo_lock(tmp_dir, dvc):
    """Create stages "foo" and "bar" plus a v1-format dvc.lock; yield its data."""
    # NOTE(review): the expected size differs on Windows, presumably because
    # of the line ending that ``echo`` emits there — confirm.
    if os.name == "nt":
        size = 5
    else:
        size = 4
    hi = HashInfo(name="md5", value="c157a79031e1c40f85931829bc5fc552")

    # Key order of the out entry: path, the hash fields, then size.
    bar_out = {"path": "bar.txt"}
    bar_out.update(hi.to_dict())
    bar_out["size"] = size

    v1_lockdata = {
        "foo": {"cmd": "echo foo"},
        "bar": {"cmd": "echo bar>bar.txt", "outs": [bar_out]},
    }

    # Only define the stages (no_exec); the lockfile is written by hand.
    dvc.run(cmd="echo foo", name="foo", no_exec=True)
    dvc.run(
        cmd="echo bar>bar.txt",
        outs=["bar.txt"],
        name="bar",
        no_exec=True,
    )
    lockfile = tmp_dir / "dvc.lock"
    lockfile.dump(v1_lockdata)
    yield v1_lockdata
def fill_values(self, values=None):
    """Set ``hash_info`` from ``values``, limited to the tracked params.

    Nothing happens when ``values`` is ``None``.  When ``self.params`` is
    empty all of ``values`` is taken; otherwise only the entries whose key
    is listed in ``self.params`` are kept.
    """
    if values is None:
        return

    # Start from everything when no params are listed, else from nothing.
    selected = {} if self.params else dict(values)
    selected.update(
        (name, values[name]) for name in self.params if name in values
    )
    self.hash_info = HashInfo(self.PARAM_PARAMS, selected)