def test_order(dvc):
    """Compare lockfile data for a stage against an ordered expectation.

    The result of ``to_single_stage_lockfile`` is checked against an
    ``OrderedDict`` that lists ``cmd``, ``deps``, ``params`` and ``outs``.
    """
    stage = create_stage(
        PipelineStage,
        dvc,
        deps=["input"],
        outs=["output"],
        params=["foo-param"],
        **kwargs
    )
    param_deps, file_deps = split_params_deps(stage)

    # Give every dependency and output a known hash before dumping.
    file_deps[0].hash_info = HashInfo("md5", "md-five")
    param_deps[0].hash_info = HashInfo("params", {"foo-param": "value"})
    stage.outs[0].hash_info = HashInfo("md5", "md5-output")

    expected = OrderedDict()
    expected["cmd"] = "command"
    expected["deps"] = [{"path": "input", "md5": "md-five"}]
    expected["params"] = {"params.yaml": {"foo-param": "value"}}
    expected["outs"] = [{"path": "output", "md5": "md5-output"}]

    assert to_single_stage_lockfile(stage) == expected
Beispiel #2
0
def test_get_hash_dirty_file(tmp_dir, dvc):
    """Stage a tracked file that was modified, then removed, in the workspace.

    While the modified file exists, staging must yield the hash of the new
    contents; once it is deleted, the hash recorded by DVC must be used.
    """
    from dvc_data import check
    from dvc_data.hashfile.hash import hash_file

    tmp_dir.dvc_gen("file", "file")
    tracked_hash = HashInfo("md5", "8c7dd922ad47494fc02c388e12c00eac")

    workspace_file = tmp_dir / "file"
    workspace_file.write_text("something")
    dirty_hash = HashInfo("md5", "437b930db84b8079c2dd804a71936b5f")

    fs = DvcFileSystem(repo=dvc)

    # The workspace copy differs from the tracked one: no md5 is reported by
    # the filesystem and staging hashes the workspace contents.
    assert fs.info("file").get("md5") is None
    staging, _, dirty_obj = stage(dvc.odb.local, "file", fs, "md5")
    assert dirty_obj.hash_info == dirty_hash
    check(staging, dirty_obj)

    # With the workspace copy gone, the hash recorded by DVC is returned.
    workspace_file.unlink()
    assert fs.info("file")["md5"] == tracked_hash.value
    _, hashed = hash_file("file", fs, "md5", state=dvc.state)
    assert hashed == tracked_hash

    # Staging still succeeds for the missing file and yields the tracked
    # hash (the object is expected to come from the local cache).
    _, _, cached_obj = stage(dvc.odb.local, "file", fs, "md5")
    assert cached_obj.hash_info == tracked_hash
Beispiel #3
0
def test_used_objs(tmp_dir, scm, dvc, run_copy, rev):
    """Check ``index.used_objs`` for the full index and for narrowed targets."""
    from dvc_data.hashfile.hash_info import HashInfo

    dvc.config["core"]["autostage"] = True
    tmp_dir.dvc_gen({"dir": {"subdir": {"file": "file"}}, "foo": "foo"})
    run_copy("foo", "bar", name="copy-foo-bar")
    scm.commit("commit")

    index = get_index(dvc, rev)

    bar_obj = HashInfo(
        name="md5",
        value="acbd18db4cc2f85cedef654fccc4a4d8",
        obj_name="bar",
    )
    nested_file_obj = HashInfo(
        name="md5",
        value="8c7dd922ad47494fc02c388e12c00eac",
        obj_name="dir/subdir/file",
    )
    dir_obj = HashInfo(
        name="md5",
        value="d28c9e28591aeb7e303dc6772ffa6f6b.dir",
        obj_name="dir",
    )
    everything = {bar_obj, nested_file_obj, dir_obj}
    dir_only = {nested_file_obj, dir_obj}

    # No target and a recursive "." target both cover every object.
    assert index.used_objs() == {None: everything}
    assert index.used_objs("dir") == {None: dir_only}
    assert index.used_objs(".", recursive=True) == {None: everything}
    # Targeting the copy stage by name yields only its "bar" output here.
    assert index.used_objs("copy-foo-bar", with_deps=True) == {
        None: {bar_obj}
    }
Beispiel #4
0
def test_fill_from_lock_use_appropriate_checksum(dvc, lock_data):
    """An ``etag`` lock entry for an s3 dep is loaded as an etag hash."""
    s3_dep = "s3://dvc-temp/foo"
    stage = create_stage(
        PipelineStage, dvc, PIPELINE_FILE, deps=[s3_dep], outs=["bar"]
    )

    # Replace the lock's deps with a single etag-based entry for the s3 path.
    lock_data["deps"] = [{"path": s3_dep, "etag": "e-tag"}]
    StageLoader.fill_from_lock(stage, lock_data)

    expected_dep = HashInfo("etag", "e-tag")
    expected_out = HashInfo("md5", "bar_checksum")
    assert stage.deps[0].hash_info == expected_dep
    assert stage.outs[0].hash_info == expected_out
Beispiel #5
0
def test_load_stage(dvc, stage_data, lock_data):
    """Load "stage-1" and verify its basic attributes, dep and out."""
    pipeline_file = Dvcfile(dvc, PIPELINE_FILE)
    loaded = StageLoader.load_stage(
        pipeline_file, "stage-1", stage_data, lock_data
    )

    # Stage-level attributes.
    assert loaded.wdir == os.path.abspath(os.curdir)
    assert loaded.name == "stage-1"
    assert loaded.cmd == "command"
    assert loaded.path == os.path.abspath(PIPELINE_FILE)

    # First dependency: definition path and hash taken from the lock data.
    assert loaded.deps[0].def_path == "foo"
    assert loaded.deps[0].hash_info == HashInfo("md5", "foo_checksum")

    # First output: definition path and hash taken from the lock data.
    assert loaded.outs[0].def_path == "bar"
    assert loaded.outs[0].hash_info == HashInfo("md5", "bar_checksum")
Beispiel #6
0
def test_load_stage_with_params(dvc, stage_data, lock_data):
    """Load a stage that declares a param and check params, deps and outs."""
    lock_data["params"] = {"params.yaml": {"lorem": "ipsum"}}
    stage_data["params"] = ["lorem"]

    pipeline_file = Dvcfile(dvc, PIPELINE_FILE)
    loaded = StageLoader.load_stage(
        pipeline_file, "stage-1", stage_data, lock_data
    )
    param_deps, file_deps = split_params_deps(loaded)

    assert file_deps[0].def_path == "foo"
    assert loaded.outs[0].def_path == "bar"
    assert param_deps[0].def_path == "params.yaml"

    # Param values and file checksums must come from the lock data.
    assert param_deps[0].hash_info == HashInfo("params", {"lorem": "ipsum"})
    assert file_deps[0].hash_info == HashInfo("md5", "foo_checksum")
    assert loaded.outs[0].hash_info == HashInfo("md5", "bar_checksum")
Beispiel #7
0
def test_used_objs(tmp_dir, dvc, path):
    """``dvc.used_objs([path])`` yields the dir object and the nested file."""
    tmp_dir.dvc_gen({"dir": {"subdir": {"file": "file"}, "other": "other"}})

    dir_obj = HashInfo("md5", "70922d6bf66eb073053a82f77d58c536.dir")
    file_obj = HashInfo("md5", "8c7dd922ad47494fc02c388e12c00eac")

    # Flatten the per-key collections of object ids into a single set.
    collected = {
        obj_id
        for obj_ids in dvc.used_objs([path]).values()
        for obj_id in obj_ids
    }

    assert collected == {dir_obj, file_obj}
Beispiel #8
0
def test_as_raw():
    """``as_raw`` drops the ``.dir`` suffix without touching the original."""
    digest = "a1d0c6e83f027327d8461063f4ac58a6"
    original = HashInfo("md5", digest + ".dir", "objname")

    raw = original.as_raw()

    # The source object keeps its ".dir" value.
    assert (original.name, original.value, original.obj_name) == (
        "md5",
        "a1d0c6e83f027327d8461063f4ac58a6.dir",
        "objname",
    )
    # The raw copy has the same name and obj_name but the bare digest.
    assert (raw.name, raw.value, raw.obj_name) == (
        "md5",
        "a1d0c6e83f027327d8461063f4ac58a6",
        "objname",
    )
Beispiel #9
0
def test_fill_from_lock_deps_outs(dvc, lock_data):
    """``fill_from_lock`` sets hashes on a stage that starts without any."""
    stage = create_stage(
        PipelineStage, dvc, PIPELINE_FILE, deps=["foo"], outs=["bar"]
    )

    # A freshly created stage has no hash info on any dep or out.
    assert not any(item.hash_info for item in chain(stage.deps, stage.outs))

    StageLoader.fill_from_lock(stage, lock_data)

    foo_dep, bar_out = stage.deps[0], stage.outs[0]
    assert foo_dep.hash_info == HashInfo("md5", "foo_checksum")
    assert bar_out.hash_info == HashInfo("md5", "bar_checksum")
Beispiel #10
0
def test_fill_from_lock_missing_checksums(dvc, lock_data):
    """Deps/outs that are expected to have no lock entry stay without a hash."""
    stage = create_stage(
        PipelineStage,
        dvc,
        PIPELINE_FILE,
        deps=["foo", "foo1"],
        outs=["bar", "bar1"],
    )

    StageLoader.fill_from_lock(stage, lock_data)

    # "foo" and "bar" receive their checksums from the lock data.
    assert stage.deps[0].hash_info == HashInfo("md5", "foo_checksum")
    assert stage.outs[0].hash_info == HashInfo("md5", "bar_checksum")

    # "foo1" and "bar1" must end up with empty hash info.
    assert not stage.deps[1].hash_info
    assert not stage.outs[1].hash_info
Beispiel #11
0
def test_cache_load_bad_dir_cache(tmp_dir, dvc):
    """Loading a ``.dir`` object with unusable contents raises an error.

    Two payloads are tried: text that is not JSON, and JSON that is an
    object (``{"a": "b"}``).
    """
    from dvc_data import load

    bad_payloads = (
        ("123.dir", "<clearly>not,json"),
        ("234.dir", '{"a": "b"}'),
    )
    for dir_hash, payload in bad_payloads:
        # Write the payload at the path the local odb maps this hash to.
        fname = os.fspath(dvc.odb.local.oid_to_path(dir_hash))
        tmp_dir.gen({fname: payload})
        with pytest.raises(ObjectFormatError):
            load(dvc.odb.local, HashInfo("md5", dir_hash))
def test_lock_outs_order(dvc, typ):
    """Outs declared as input1, input0 are expected path-sorted in the lock."""
    stage = create_stage(
        PipelineStage, dvc, **{typ: ["input1", "input0"]}, **kwargs
    )
    stage.outs[0].hash_info = HashInfo("md5", "md-one1")
    stage.outs[1].hash_info = HashInfo("md5", "md-zer0")

    # Expected entries: "input0" first, then "input1".
    sorted_outs = [
        OrderedDict([("path", out_path), ("md5", digest)])
        for out_path, digest in (("input0", "md-zer0"), ("input1", "md-one1"))
    ]
    expected = OrderedDict([("cmd", "command"), ("outs", sorted_outs)])

    assert to_single_stage_lockfile(stage) == expected
Beispiel #13
0
def test_get_hash_granular(tmp_dir, dvc):
    """Stage a subdirectory and a file inside a tracked directory."""
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    )
    fs = DataFileSystem(repo=dvc)
    odb = dvc.odb.local

    # The nested directory has no md5 in its info, yet staging yields one.
    subdir = "dir/subdir"
    assert fs.info(subdir).get("md5") is None
    _, _, subdir_obj = stage(odb, subdir, fs, "md5", dry_run=True)
    expected_subdir = HashInfo("md5", "af314506f1622d107e0ed3f14ec1a3b5.dir")
    assert subdir_obj.hash_info == expected_subdir

    # The nested file reports its md5 directly, and staging agrees with it.
    data_path = posixpath.join(subdir, "data")
    assert fs.info(data_path)["md5"] == "8d777f385d3dfec8815d20f7496026dc"
    _, _, data_obj = stage(odb, data_path, fs, "md5", dry_run=True)
    expected_data = HashInfo("md5", "8d777f385d3dfec8815d20f7496026dc")
    assert data_obj.hash_info == expected_data
Beispiel #14
0
    def fill_from_lock(stage, lock_data=None):
        """Populate a stage's deps and outs from its lock data.

        Params dependencies receive the values stored under the lock's
        params section for their ``def_path``. Every other dep/out gets
        ``meta`` and ``hash_info`` rebuilt from the lock entry whose
        ``"path"`` equals its ``def_path``, or from an empty dict when no
        such entry exists. Does nothing when ``lock_data`` is falsy.
        """
        if not lock_data:
            return
        assert isinstance(lock_data, dict)

        sections = (
            (StageParams.PARAM_DEPS, stage.deps),
            (StageParams.PARAM_OUTS, stage.outs),
        )

        # Index each section's lock entries by path for direct lookup.
        entries_by_path = {}
        for section, _ in sections:
            entries_by_path[section] = {
                entry["path"]: entry for entry in lock_data.get(section, {})
            }

        for section, members in sections:
            for member in members:
                def_path = member.def_path
                if isinstance(member, dependency.ParamsDependency):
                    values = get_in(lock_data, [stage.PARAM_PARAMS, def_path])
                    member.fill_values(values)
                    continue

                # Work on a copy so the caller's lock data is not mutated.
                entry = get_in(entries_by_path, [section, def_path], {}).copy()
                entry.pop("path", None)
                member.meta = Meta.from_dict(entry)
                member.hash_info = HashInfo.from_dict(entry)
Beispiel #15
0
def test_load_stage_with_metrics_and_plots(dvc, stage_data, lock_data, typ):
    """Outputs declared under the ``typ`` key still load with their hash."""
    # Move the "outs" definition under the key being tested.
    stage_data[typ] = stage_data.pop("outs")

    pipeline_file = Dvcfile(dvc, PIPELINE_FILE)
    loaded = StageLoader.load_stage(
        pipeline_file, "stage-1", stage_data, lock_data
    )

    assert loaded.outs[0].def_path == "bar"
    assert loaded.outs[0].hash_info == HashInfo("md5", "bar_checksum")
def test_lock_deps(dvc):
    """A stage with one hashed dep dumps ``cmd`` followed by ``deps``."""
    stage = create_stage(PipelineStage, dvc, deps=["input"], **kwargs)
    stage.deps[0].hash_info = HashInfo("md5", "md-five")

    dep_entry = OrderedDict([("path", "input"), ("md5", "md-five")])
    expected = OrderedDict([("cmd", "command"), ("deps", [dep_entry])])

    assert to_single_stage_lockfile(stage) == expected
def test_lock_params_file_sorted(dvc):
    """Check the ordering of the ``params`` section of the lock data.

    The expectation lists the default params file first, then the other
    files in name order, with the keys inside each file sorted as well.
    """
    declared_params = [
        "lorem.ipsum",
        "abc",
        {"myparams.yaml": ["foo", "foobar"]},
        {"a-params-file.yaml": ["bar", "barr"]},
    ]
    stage = create_stage(PipelineStage, dvc, params=declared_params, **kwargs)

    # deps[0]: default params file, deps[1]: myparams.yaml,
    # deps[2]: a-params-file.yaml
    stage.deps[0].hash_info = HashInfo(
        "params", {"lorem.ipsum": {"lorem1": 1, "lorem2": 2}, "abc": 3}
    )
    stage.deps[1].hash_info = HashInfo(
        "params", {"foo": ["f", "o", "o"], "foobar": "foobar"}
    )
    stage.deps[2].hash_info = HashInfo(
        "params", {"bar": ["b", "a", "r"], "barr": "barr"}
    )

    expected = OrderedDict()
    expected[DEFAULT_PARAMS_FILE] = OrderedDict(
        [("abc", 3), ("lorem.ipsum", {"lorem1": 1, "lorem2": 2})]
    )
    expected["a-params-file.yaml"] = OrderedDict(
        [("bar", ["b", "a", "r"]), ("barr", "barr")]
    )
    expected["myparams.yaml"] = OrderedDict(
        [("foo", ["f", "o", "o"]), ("foobar", "foobar")]
    )

    assert to_single_stage_lockfile(stage)["params"] == expected
Beispiel #18
0
def test_fill_from_lock_with_missing_sections(dvc, lock_data):
    """A lock without ``deps`` or ``outs`` leaves that side without hashes."""
    stage = create_stage(
        PipelineStage, dvc, PIPELINE_FILE, deps=["foo"], outs=["bar"]
    )

    # Lock without a "deps" section: only the out gets a hash.
    without_deps = deepcopy(lock_data)
    without_deps.pop("deps")
    StageLoader.fill_from_lock(stage, without_deps)
    assert not stage.deps[0].hash_info
    assert stage.outs[0].hash_info == HashInfo("md5", "bar_checksum")

    # Lock without an "outs" section: the same stage is filled again, and
    # this time the dep has a hash while the out's hash is empty.
    without_outs = deepcopy(lock_data)
    without_outs.pop("outs")
    StageLoader.fill_from_lock(stage, without_outs)
    assert stage.deps[0].hash_info == HashInfo("md5", "foo_checksum")
    assert not stage.outs[0].hash_info
Beispiel #19
0
def test_get_hash_dirty_file(tmp_dir, dvc):
    """DataFileSystem reports the tracked hash even if the file was edited."""
    tmp_dir.dvc_gen("file", "file")
    # Overwrite the workspace copy after it has been tracked.
    (tmp_dir / "file").write_text("something")

    tracked_md5 = "8c7dd922ad47494fc02c388e12c00eac"
    fs = DataFileSystem(repo=dvc)

    assert fs.info("file").get("md5") == tracked_md5
    _, _, staged = stage(dvc.odb.local, "file", fs, "md5", dry_run=True)
    assert staged.hash_info == HashInfo("md5", tracked_md5)
Beispiel #20
0
def test_get_hash_dirty_dir(tmp_dir, dvc):
    """DataFileSystem reports the tracked dir hash despite an added file."""
    tmp_dir.dvc_gen({"dir": {"foo": "foo", "bar": "bar"}})
    # Add a file to the directory after it has been tracked.
    (tmp_dir / "dir" / "baz").write_text("baz")

    tracked_md5 = "5ea40360f5b4ec688df672a4db9c17d1.dir"
    fs = DataFileSystem(repo=dvc)

    assert fs.info("dir").get("md5") == tracked_md5
    _, _, staged = stage(dvc.odb.local, "dir", fs, "md5", dry_run=True)
    assert staged.hash_info == HashInfo("md5", tracked_md5)
Beispiel #21
0
def test_get_hash_cached_file(tmp_dir, dvc, mocker):
    """The md5 appears in ``fs.info`` only once the workspace file is gone."""
    tmp_dir.dvc_gen({"foo": "foo"})
    fs = DvcFileSystem(repo=dvc)
    foo_md5 = "acbd18db4cc2f85cedef654fccc4a4d8"

    # File present in the workspace: info has no md5, staging computes it.
    assert fs.info("foo").get("md5") is None
    _, _, staged = stage(dvc.odb.local, "foo", fs, "md5")
    assert staged.hash_info == HashInfo("md5", foo_md5)

    # File removed from the workspace: info now carries the md5.
    (tmp_dir / "foo").unlink()
    assert fs.info("foo")["md5"] == foo_md5
Beispiel #22
0
def test_get_hash_cached_dir(tmp_dir, dvc, mocker):
    """Staging a tracked dir gives the same hash before and after removal."""
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    )
    fs = DvcFileSystem(repo=dvc)
    expected = "8761c4e9acad696bee718615e23e22db.dir"
    expected_info = HashInfo("md5", expected)

    # Directory present in the workspace: info has no md5.
    assert fs.info("dir").get("md5") is None
    _, _, staged = stage(dvc.odb.local, "dir", fs, "md5")
    assert staged.hash_info == expected_info

    # Directory removed from the workspace: info now carries the md5 and
    # staging still resolves to the same hash.
    shutil.rmtree(tmp_dir / "dir")
    assert fs.info("dir")["md5"] == expected
    _, _, staged = stage(dvc.odb.local, "dir", fs, "md5")
    assert staged.hash_info == expected_info
Beispiel #23
0
    def __init__(
        self,
        stage,
        path,
        info=None,
        cache=True,
        metric=False,
        plot=False,
        persist=False,
        checkpoint=False,
        live=False,
        desc=None,
        remote=None,
        repo=None,
    ):
        """Set up the filesystem, paths, hash metadata and flags for `path`.

        Args:
            stage: owning stage; may be falsy, in which case `repo` is used
                as given.
            path: path or URL as written in the definition; passed to
                `get_cloud_fs` to select the filesystem.
            info: dict handed to `Meta.from_dict` and `HashInfo.from_dict`.
            cache: stored as `use_cache`, forced to False when
                `IS_DEPENDENCY` is set.
            metric: stored as `metric`, forced to False when `IS_DEPENDENCY`
                is set.
            plot: stored as `plot`, forced to False when `IS_DEPENDENCY` is
                set.
            persist: stored unchanged.
            checkpoint: stored unchanged.
            live: stored unchanged.
            desc: stored unchanged.
            remote: stored unchanged.
            repo: explicit repo; when falsy and `stage` is given,
                `stage.repo` is used instead.
        """
        # Fall back to the stage's repo only when no repo was passed.
        self.repo = stage.repo if not repo and stage else repo
        fs_cls, fs_config, fs_path = get_cloud_fs(self.repo, url=path)
        self.fs = fs_cls(**fs_config)

        # A local path for which path_isin(path, repo root) holds is stored
        # relative to the stage's wdir and reuses the repo's own filesystem
        # object; anything else keeps the path exactly as given.
        if (self.fs.protocol == "local" and stage
                and isinstance(stage.repo.fs, LocalFileSystem)
                and path_isin(path, stage.repo.root_dir)):
            self.def_path = relpath(path, stage.wdir)
            self.fs = stage.repo.fs
        else:
            self.def_path = path

        # A relative local def_path is served by the repo's filesystem.
        if (self.repo and self.fs.protocol == "local"
                and not self.fs.path.isabs(self.def_path)):
            self.fs = self.repo.fs

        self._validate_output_path(path, stage)
        # Output (and dependency) objects carry several paths/urls;
        # here is a list with comments:
        #
        #   .def_path - path from definition in DVC file
        #   .fspath - local only, resolved
        #   .__str__ - for presentation purposes, def_path/relpath
        #
        # A resolved path holds the actual location: it should be absolute
        # and must not contain remote:// refs.
        self.stage = stage
        self.meta = Meta.from_dict(info)
        self.hash_info = HashInfo.from_dict(info)
        # Cache/metric/plot flags are always off for dependencies.
        self.use_cache = False if self.IS_DEPENDENCY else cache
        self.metric = False if self.IS_DEPENDENCY else metric
        self.plot = False if self.IS_DEPENDENCY else plot
        self.persist = persist
        self.checkpoint = checkpoint
        self.live = live
        self.desc = desc

        # fs_path comes from get_cloud_fs above; it is parsed against the
        # filesystem that was finally selected.
        self.fs_path = self._parse_path(self.fs, fs_path)
        self.obj = None

        self.remote = remote
Beispiel #24
0
def test_state(tmp_dir, dvc):
    """A saved state entry is returned until the file is replaced."""
    tmp_dir.gen("foo", "foo content")
    path = tmp_dir / "foo"
    fs = dvc.fs
    original_hash = HashInfo("md5", file_md5(path, fs))

    state = State(dvc.root_dir, dvc.tmp_dir, dvc.dvcignore)

    # Saving then reading back returns the same hash info.
    state.save(path, fs, original_hash)
    _, stored = state.get(path, fs)
    assert stored == original_hash

    # Recreate the file with different contents: the old entry must no
    # longer be returned.
    path.unlink()
    path.write_text("1")
    assert state.get(path, fs) == (None, None)

    # Saving the new hash makes it retrievable again.
    replaced_hash = HashInfo("md5", file_md5(path, fs))
    state.save(path, fs, replaced_hash)
    _, stored = state.get(path, fs)
    assert stored == replaced_hash
Beispiel #25
0
def test_get_hash_cached_granular(tmp_dir, dvc, mocker):
    """Stage a nested dir and file, then read the file's md5 once removed."""
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    )
    fs = DvcFileSystem(repo=dvc)
    odb = dvc.odb.local
    subdir = "dir/subdir"
    data_path = posixpath.join(subdir, "data")
    data_md5 = "8d777f385d3dfec8815d20f7496026dc"

    # Nested directory: info has no md5, staging computes the dir hash.
    assert fs.info(subdir).get("md5") is None
    _, _, subdir_obj = stage(odb, subdir, fs, "md5")
    expected_subdir = HashInfo("md5", "af314506f1622d107e0ed3f14ec1a3b5.dir")
    assert subdir_obj.hash_info == expected_subdir

    # Nested file while it exists in the workspace: same pattern.
    assert fs.info(data_path).get("md5") is None
    _, _, data_obj = stage(odb, data_path, fs, "md5")
    assert data_obj.hash_info == HashInfo("md5", data_md5)

    # After the workspace copy is removed, info carries the md5.
    (tmp_dir / "dir" / "subdir" / "data").unlink()
    assert fs.info(data_path)["md5"] == data_md5
Beispiel #26
0
def test_get_hash_dirty_dir(tmp_dir, dvc):
    """Staging through DvcFileSystem includes a file added after tracking."""
    tmp_dir.dvc_gen({"dir": {"foo": "foo", "bar": "bar"}})
    # Add a third file to the directory after it has been tracked.
    (tmp_dir / "dir" / "baz").write_text("baz")

    fs = DvcFileSystem(repo=dvc)
    _, meta, staged = stage(dvc.odb.local, "dir", fs, "md5")

    expected = HashInfo("md5", "ba75a2162ca9c29acecb7957105a0bc2.dir")
    assert staged.hash_info == expected
    assert meta.nfiles == 3
Beispiel #27
0
    def get_hash(self):
        """Build the hash info for the current parameter values.

        Returns:
            A ``HashInfo`` named ``self.PARAM_PARAMS`` whose value is the
            mapping returned by ``self.read_params()``.

        Raises:
            MissingParamsError: if any name in ``self.params`` is absent
                from the values that were read.
        """
        info = self.read_params()

        missing_params = set(self.params) - set(info.keys())
        if missing_params:
            # Sort the names so the message is stable across runs; the
            # iteration order of a set of strings is not.
            raise MissingParamsError(
                "Parameters '{}' are missing from '{}'.".format(
                    ", ".join(sorted(missing_params)), self))

        return HashInfo(self.PARAM_PARAMS, info)
Beispiel #28
0
def test_status_download_optimization(mocker, dvc):
    """Skip remote existence checks for objects already in the local cache.

    When comparing status in order to pull from a remote cache, and every
    wanted object is reported as present locally, the remote must not be
    asked whether those objects exist.
    """
    from dvc_data.status import compare_status

    local_odb = LocalHashFileDB(LocalFileSystem(), os.getcwd())
    obj_ids = {
        HashInfo("md5", "acbd18db4cc2f85cedef654fccc4a4d8"),
        HashInfo("md5", "37b51d194a7513e45b56f6524f2d51f2"),
    }

    # Make the local odb report every requested object as present.
    present_locally = [obj_id.value for obj_id in obj_ids]
    mocker.patch.object(local_odb, "oids_exist", return_value=present_locally)

    remote_odb = mocker.Mock()
    compare_status(remote_odb, local_odb, obj_ids, check_deleted=False)

    assert remote_odb.oids_exist.call_count == 0
Beispiel #29
0
def v1_repo_lock(tmp_dir, dvc):
    """Set up a repo whose ``dvc.lock`` uses the v1 layout and yield its data.

    Two stages are registered with ``no_exec=True``, then the lockfile is
    written directly with the hand-built v1 data.
    """
    foo_cmd = "echo foo"
    bar_cmd = "echo bar>bar.txt"

    # The expected size differs on Windows (presumably the line ending that
    # `echo` writes - TODO confirm).
    size = 5 if os.name == "nt" else 4
    hi = HashInfo(name="md5", value="c157a79031e1c40f85931829bc5fc552")

    # Lock entry for bar.txt: path, then the hash fields, then the size.
    bar_out = {"path": "bar.txt"}
    bar_out.update(hi.to_dict())
    bar_out["size"] = size

    v1_lockdata = {
        "foo": {"cmd": foo_cmd},
        "bar": {"cmd": bar_cmd, "outs": [bar_out]},
    }

    dvc.run(cmd=foo_cmd, name="foo", no_exec=True)
    dvc.run(cmd=bar_cmd, outs=["bar.txt"], name="bar", no_exec=True)
    (tmp_dir / "dvc.lock").dump(v1_lockdata)
    yield v1_lockdata
Beispiel #30
0
    def fill_values(self, values=None):
        """Set ``hash_info`` from already-known param values.

        When ``self.params`` is empty every entry of ``values`` is kept;
        otherwise only the names listed in ``self.params`` that are present
        in ``values`` are kept. Does nothing when ``values`` is None.
        """
        if values is None:
            return

        # No explicit params: start from all given values.
        selected = {} if self.params else dict(values)
        selected.update(
            (name, values[name]) for name in self.params if name in values
        )
        self.hash_info = HashInfo(self.PARAM_PARAMS, selected)