Beispiel #1
0
def test_get_hash_dirty_file(tmp_dir, dvc):
    """Check hashes seen through DvcFileSystem for a modified tracked file.

    The file is tracked, overwritten in the workspace and staged, then
    deleted from the workspace and hashed/staged again.
    """
    from dvc_data import check
    from dvc_data.hashfile.hash import hash_file

    tmp_dir.dvc_gen("file", "file")
    tracked_hash = HashInfo("md5", "8c7dd922ad47494fc02c388e12c00eac")

    workspace_file = tmp_dir / "file"
    workspace_file.write_text("something")
    modified_hash = HashInfo("md5", "437b930db84b8079c2dd804a71936b5f")

    # While the modified file is present in the workspace the filesystem
    # reports no md5, and staging must yield the workspace hash rather than
    # the DVC cached one.
    dvc_fs = DvcFileSystem(repo=dvc)
    file_info = dvc_fs.info("file")
    assert file_info.get("md5") is None
    staging_odb, _, staged = stage(dvc.odb.local, "file", dvc_fs, "md5")
    assert staged.hash_info == modified_hash
    check(staging_odb, staged)

    # Once the workspace copy is gone, both the filesystem info and
    # hash_file() must report the DVC cached hash.
    workspace_file.unlink()
    assert dvc_fs.info("file")["md5"] == tracked_hash.value
    _, computed_hash = hash_file("file", dvc_fs, "md5", state=dvc.state)
    assert computed_hash == tracked_hash

    # The file can still be staged although it is missing in the workspace,
    # since the filesystem falls back to the DVC cached hash (and refers to
    # the local cache object).
    _, _, staged = stage(dvc.odb.local, "file", dvc_fs, "md5")
    assert staged.hash_info == tracked_hash
Beispiel #2
0
def test_get_hash_granular(tmp_dir, dvc):
    """Dry-run stage a subdirectory and a nested file of a tracked dir.

    Through DataFileSystem the subdirectory itself reports no md5, while
    the file inside it does.
    """
    tmp_dir.dvc_gen(
        {
            "dir": {
                "foo": "foo",
                "bar": "bar",
                "subdir": {"data": "data"},
            }
        }
    )
    data_fs = DataFileSystem(repo=dvc)

    # Subdirectory of the tracked directory.
    subdir = "dir/subdir"
    subdir_md5 = "af314506f1622d107e0ed3f14ec1a3b5.dir"
    assert data_fs.info(subdir).get("md5") is None
    _, _, staged = stage(dvc.odb.local, subdir, data_fs, "md5", dry_run=True)
    assert staged.hash_info == HashInfo("md5", subdir_md5)

    # Single file inside that subdirectory.
    data_path = posixpath.join(subdir, "data")
    data_md5 = "8d777f385d3dfec8815d20f7496026dc"
    assert data_fs.info(data_path)["md5"] == data_md5
    _, _, staged = stage(
        dvc.odb.local, data_path, data_fs, "md5", dry_run=True
    )
    assert staged.hash_info == HashInfo("md5", data_md5)
Beispiel #3
0
def test_staging_file(tmp_dir, dvc):
    """Stage a plain file and then transfer it into the local odb.

    Before the transfer the object must exist only in the staging odb;
    afterwards it must pass `check` in both.
    """
    from dvc_data import check
    from dvc_data.stage import stage
    from dvc_data.transfer import transfer

    tmp_dir.gen("foo", "foo")
    local_fs = LocalFileSystem()

    cache = dvc.odb.local
    foo_path = (tmp_dir / "foo").fs_path
    staging, _, staged = stage(cache, foo_path, local_fs, "md5")

    # Right after staging, only the staging odb knows the object.
    assert not cache.exists(staged.hash_info.value)
    assert staging.exists(staged.hash_info.value)

    with pytest.raises(FileNotFoundError):
        check(cache, staged)
    check(staging, staged)

    # After the transfer the object is valid in both databases.
    transfer(staging, cache, {staged.hash_info}, hardlink=True)
    check(cache, staged)
    check(staging, staged)

    cached_path = cache.oid_to_path(staged.hash_info.value)
    assert local_fs.exists(cached_path)
Beispiel #4
0
def test_get_hash_cached_dir(tmp_dir, dvc, mocker):
    """Stage a tracked directory before and after removing it on disk.

    The staged hash must be the same in both cases; the filesystem only
    reports an md5 for "dir" once the workspace copy is gone.
    """
    tmp_dir.dvc_gen(
        {
            "dir": {
                "foo": "foo",
                "bar": "bar",
                "subdir": {"data": "data"},
            }
        }
    )
    dvc_fs = DvcFileSystem(repo=dvc)
    expected = "8761c4e9acad696bee718615e23e22db.dir"

    # Directory still present in the workspace.
    assert dvc_fs.info("dir").get("md5") is None
    _, _, staged = stage(dvc.odb.local, "dir", dvc_fs, "md5")
    assert staged.hash_info == HashInfo("md5", expected)

    # Directory removed from the workspace.
    shutil.rmtree(tmp_dir / "dir")
    assert dvc_fs.info("dir")["md5"] == expected
    _, _, staged = stage(dvc.odb.local, "dir", dvc_fs, "md5")
    assert staged.hash_info == HashInfo("md5", expected)
Beispiel #5
0
def test_get_hash_cached_file(tmp_dir, dvc, mocker):
    """Stage a tracked file, then check the md5 reported after deleting it."""
    tmp_dir.dvc_gen({"foo": "foo"})
    dvc_fs = DvcFileSystem(repo=dvc)
    expected = "acbd18db4cc2f85cedef654fccc4a4d8"

    # File present in the workspace: no md5 in info, hash comes from staging.
    foo_info = dvc_fs.info("foo")
    assert foo_info.get("md5") is None
    _, _, staged = stage(dvc.odb.local, "foo", dvc_fs, "md5")
    assert staged.hash_info == HashInfo("md5", expected)

    # File deleted from the workspace: info now carries the md5.
    foo_path = tmp_dir / "foo"
    foo_path.unlink()
    assert dvc_fs.info("foo")["md5"] == expected
Beispiel #6
0
def test_get_hash_dirty_dir(tmp_dir, dvc):
    """Dry-run stage a tracked dir via DataFileSystem after adding a file.

    An extra, untracked "baz" is written into the directory before the
    filesystem is queried; both the info md5 and the staged hash must
    equal `expected`.
    """
    tmp_dir.dvc_gen({"dir": {"foo": "foo", "bar": "bar"}})
    extra_file = tmp_dir / "dir" / "baz"
    extra_file.write_text("baz")

    data_fs = DataFileSystem(repo=dvc)
    expected = "5ea40360f5b4ec688df672a4db9c17d1.dir"

    dir_info = data_fs.info("dir")
    assert dir_info.get("md5") == expected

    _, _, staged = stage(dvc.odb.local, "dir", data_fs, "md5", dry_run=True)
    assert staged.hash_info == HashInfo("md5", expected)
Beispiel #7
0
def test_get_hash_dirty_file(tmp_dir, dvc):
    """Dry-run stage a tracked file via DataFileSystem after modifying it.

    The workspace copy is overwritten before the filesystem is queried;
    both the info md5 and the staged hash must equal `expected`.
    """
    tmp_dir.dvc_gen("file", "file")
    workspace_file = tmp_dir / "file"
    workspace_file.write_text("something")

    data_fs = DataFileSystem(repo=dvc)
    expected = "8c7dd922ad47494fc02c388e12c00eac"

    file_info = data_fs.info("file")
    assert file_info.get("md5") == expected

    _, _, staged = stage(dvc.odb.local, "file", data_fs, "md5", dry_run=True)
    assert staged.hash_info == HashInfo("md5", expected)
Beispiel #8
0
def test_get_hash_dirty_dir(tmp_dir, dvc):
    """Stage a tracked dir via DvcFileSystem after adding a file to it.

    The staged tree must have the expected hash and report three files.
    """
    tmp_dir.dvc_gen({"dir": {"foo": "foo", "bar": "bar"}})
    extra_file = tmp_dir / "dir" / "baz"
    extra_file.write_text("baz")

    dvc_fs = DvcFileSystem(repo=dvc)
    expected_hash = HashInfo("md5", "ba75a2162ca9c29acecb7957105a0bc2.dir")

    _, staged_meta, staged = stage(dvc.odb.local, "dir", dvc_fs, "md5")
    assert staged.hash_info == expected_hash
    assert staged_meta.nfiles == 3
Beispiel #9
0
def test_get_hash_cached_granular(tmp_dir, dvc, mocker):
    """Stage a subdirectory and a nested file of a tracked directory.

    While the nested file exists in the workspace the filesystem reports no
    md5 for it; after it is deleted the md5 shows up in its info.
    """
    tmp_dir.dvc_gen(
        {
            "dir": {
                "foo": "foo",
                "bar": "bar",
                "subdir": {"data": "data"},
            }
        }
    )
    dvc_fs = DvcFileSystem(repo=dvc)

    # Subdirectory of the tracked directory.
    subdir = "dir/subdir"
    assert dvc_fs.info(subdir).get("md5") is None
    _, _, staged = stage(dvc.odb.local, subdir, dvc_fs, "md5")
    assert staged.hash_info == HashInfo(
        "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir"
    )

    # File inside the subdirectory, still present in the workspace.
    data_path = posixpath.join(subdir, "data")
    data_md5 = "8d777f385d3dfec8815d20f7496026dc"
    assert dvc_fs.info(data_path).get("md5") is None
    _, _, staged = stage(dvc.odb.local, data_path, dvc_fs, "md5")
    assert staged.hash_info == HashInfo("md5", data_md5)

    # Same file after it has been removed from the workspace.
    (tmp_dir / "dir" / "subdir" / "data").unlink()
    assert dvc_fs.info(data_path)["md5"] == data_md5
Beispiel #10
0
def test_stage_dir_optimization(
    tmp_dir, dvc, mocker, dry_run, expected_staging_contents
):
    """Staging the same directory twice must reuse the stored tree.

    The first `stage.stage` call is expected to go through `_stage_tree` and
    add exactly one new object (the raw tree) to the odb. The second call
    must invoke neither `_stage_tree` nor `_build_tree` and must go through
    `Tree.load` with the tree's raw hash instead.
    """
    from dvc_data import stage
    from dvc_data.objects.tree import Tree

    tmp_dir.dvc_gen(
        {"data": {"foo": "bar", "subdir": {"subfoo": "subbar"}}}
    )
    cache = dvc.odb.local

    # Snapshot of what the odb holds before the directory is changed.
    known_oids = set(cache.all())
    clean_staging()

    tmp_dir.gen({"data": {"baz": "quz"}})

    # First staging pass.
    stage_tree_spy = mocker.spy(stage, "_stage_tree")
    _, _, first_tree = stage.stage(
        cache, "data", cache.fs, cache.fs.PARAM_CHECKSUM
    )

    assert stage_tree_spy.called
    added_oids = set(cache.all()) - known_oids
    assert added_oids == {first_tree.hash_info.as_raw().value}
    stage_tree_spy.reset_mock()
    clean_staging()

    # Second staging pass, observed through additional spies.
    tree_load_spy = mocker.spy(Tree, "load")
    build_tree_spy = mocker.spy(stage, "_build_tree")

    staging, _, second_tree = stage.stage(
        cache, "data", cache.fs, cache.fs.PARAM_CHECKSUM, dry_run=dry_run
    )
    assert not stage_tree_spy.called
    assert not build_tree_spy.called

    positional_args, _ = tree_load_spy.call_args
    assert positional_args[1].value == second_tree.hash_info.as_raw().value

    assert set(staging.all()) == expected_staging_contents
Beispiel #11
0
    def _get_used_and_obj(
        self,
        obj_only=False,
        **kwargs
    ) -> Tuple[Dict[Optional["ObjectDB"], Set["HashInfo"]], "HashFile"]:
        """Collect used object ids and stage the object at ``self.def_path``.

        Opens the repo via ``self._make_repo`` and stages ``self.def_path``
        through that repo's ``dvcfs`` against the local odb. Unless
        ``obj_only`` is set, it first gathers the objects reported by
        ``repo.used_objs`` for that path.

        Args:
            obj_only: when true, skip the ``repo.used_objs`` collection and
                only stage the object.
            **kwargs: ``locked`` (popped, defaults to ``True``) is forwarded
                to ``self._make_repo`` and controls whether the revision is
                recorded as the rev lock; ``jobs`` is forwarded to
                ``repo.used_objs``.

        Returns:
            A ``(used_obj_ids, staged_obj)`` tuple, where ``used_obj_ids``
            maps an odb to the set of hash infos used from it.

        Raises:
            PathMissingError: if staging raises ``FileNotFoundError`` or
                ``TreeError``.
        """
        from dvc.config import NoRemoteError
        from dvc.exceptions import NoOutputOrStageError, PathMissingError
        from dvc.utils import as_posix
        from dvc_data.objects.tree import Tree, TreeError
        from dvc_data.stage import stage

        local_odb = self.repo.odb.local
        locked = kwargs.pop("locked", True)
        with self._make_repo(locked=locked,
                             cache_dir=local_odb.cache_dir) as repo:
            used_obj_ids = defaultdict(set)
            rev = repo.get_rev()
            # Record the resolved revision as the rev lock, but only when
            # locking was requested and no rev lock is stored yet.
            if locked and self.def_repo.get(self.PARAM_REV_LOCK) is None:
                self.def_repo[self.PARAM_REV_LOCK] = rev

            if not obj_only:
                try:
                    for odb, obj_ids in repo.used_objs(
                        [os.path.join(repo.root_dir, self.def_path)],
                            force=True,
                            jobs=kwargs.get("jobs"),
                            recursive=True,
                    ).items():
                        # Entries keyed by None are attributed to the repo's
                        # remote odb, which is marked read-only here.
                        if odb is None:
                            odb = repo.cloud.get_remote_odb()
                            odb.read_only = True
                        self._check_circular_import(odb, obj_ids)
                        used_obj_ids[odb].update(obj_ids)
                except (NoRemoteError, NoOutputOrStageError):
                    # Deliberately ignored: the staged object below is still
                    # collected.
                    pass

            try:
                staging, _, staged_obj = stage(
                    local_odb,
                    as_posix(self.def_path),
                    repo.dvcfs,
                    local_odb.fs.PARAM_CHECKSUM,
                )
            except (FileNotFoundError, TreeError) as exc:
                raise PathMissingError(self.def_path,
                                       self.def_repo[self.PARAM_URL]) from exc
            # Work on a copy so that setting read_only does not touch the
            # staging odb object returned by stage().
            staging = copy(staging)
            staging.read_only = True

            # Remember the staged object for this revision.
            self._staged_objs[rev] = staged_obj
            used_obj_ids[staging].add(staged_obj.hash_info)
            # For a tree, also register the oid of every entry it yields.
            if isinstance(staged_obj, Tree):
                used_obj_ids[staging].update(oid for _, _, oid in staged_obj)
            return used_obj_ids, staged_obj
Beispiel #12
0
def test_get_hash_mixed_dir(tmp_dir, scm, dvc):
    """Stage a directory holding one DVC-added and one Git-added file."""
    tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}})

    # "foo" goes to DVC; "bar" plus the files listed next to it go to Git.
    tmp_dir.dvc.add(os.path.join("dir", "foo"))
    git_paths = [
        os.path.join("dir", name)
        for name in ("bar", ".gitignore", "foo.dvc")
    ]
    tmp_dir.scm.add(git_paths)
    tmp_dir.scm.commit("add dir")
    clean_staging()

    dvc_fs = DvcFileSystem(repo=dvc)
    expected_hash = HashInfo("md5", "e1d9e8eae5374860ae025ec84cfd85c7.dir")
    _, _, staged = stage(dvc.odb.local, "dir", dvc_fs, "md5")
    assert staged.hash_info == expected_hash
Beispiel #13
0
def test_dir_hash_should_be_key_order_agnostic(tmp_dir, dvc):
    """The staged dir hash must not depend on key order in tree entries.

    `_stage_tree` is patched to return a tree built from entry dicts; the
    same entries are supplied twice with their keys in a different order and
    the resulting hashes are compared.
    """
    from dvc_data.objects.tree import Tree
    from dvc_data.stage import stage

    tmp_dir.gen({"data": {"1": "1 content", "2": "2 content"}})

    data_path = (tmp_dir / "data").fs_path

    def staged_hash(entries):
        # Build the tree from the given entries and stage with
        # `_stage_tree` patched to hand that tree back.
        fake_tree = Tree.from_list(entries)
        fake_tree.digest()
        with patch(
            "dvc_data.stage._stage_tree", return_value=(None, fake_tree)
        ):
            _, _, staged = stage(
                dvc.odb.local, data_path, dvc.odb.local.fs, "md5"
            )
            return staged.hash_info

    hash1 = staged_hash(
        [
            {"relpath": "1", "md5": "1"},
            {"relpath": "2", "md5": "2"},
        ]
    )

    # remove the raw dir obj to force building the tree on the next stage call
    raw_obj_path = dvc.odb.local.oid_to_path(hash1.as_raw().value)
    dvc.odb.local.fs.remove(raw_obj_path)

    hash2 = staged_hash(
        [
            {"md5": "1", "relpath": "1"},
            {"md5": "2", "relpath": "2"},
        ]
    )

    assert hash1 == hash2
Beispiel #14
0
def test_subrepos_are_ignored(tmp_dir, erepo_dir):
    """Content of a subrepo inside "dir" must not be fetched or cached.

    Covers both fetching "dir" through `dvcfs.get` and staging plus
    transferring it into an emptied local cache.
    """
    subrepo = erepo_dir / "dir" / "subrepo"
    make_subrepo(subrepo, erepo_dir.scm)
    with erepo_dir.chdir():
        erepo_dir.dvc_gen("dir/foo", "foo", commit="foo")
        erepo_dir.scm_gen("dir/bar", "bar", commit="bar")

    with subrepo.chdir():
        subrepo.dvc_gen({"file": "file"}, commit="add files on subrepo")

    out_dir = tmp_dir / "out"
    with external_repo(os.fspath(erepo_dir)) as repo:
        repo.dvcfs.get("dir", os.fspath(out_dir))
        expected_files = {"foo": "foo", "bar": "bar", ".gitignore": "/foo\n"}
        assert out_dir.read_text() == expected_files

        # clear cache to test saving to cache
        cache_dir = tmp_dir / repo.odb.local.cache_dir
        remove(cache_dir)
        clean_staging()
        makedirs(cache_dir)

        staging, _, staged = stage(
            repo.odb.local, "dir", repo.dvcfs, "md5", ignore=repo.dvcignore
        )
        transfer(
            staging,
            repo.odb.local,
            {staged.hash_info},
            shallow=False,
            hardlink=True,
        )

        # Exactly these objects are expected in the cache afterwards.
        expected_objects = {
            cache_dir / prefix / rest
            for prefix, rest in (
                ("e1", "d9e8eae5374860ae025ec84cfd85c7"),
                ("e1", "d9e8eae5374860ae025ec84cfd85c7.dir"),
                ("37", "b51d194a7513e45b56f6524f2d51f2"),
                ("94", "7d2b84e5aa88170e80dff467a5bfb6"),
                ("ac", "bd18db4cc2f85cedef654fccc4a4d8"),
            )
        }
        assert set(cache_dir.glob("??/*")) == expected_objects