Example #1
    def get_file_hash(self, path_info):
        hash_info = HashInfo(self.PARAM_CHECKSUM, file_md5(path_info)[0],)

        if hash_info:
            hash_info.size = os.path.getsize(path_info)

        return hash_info
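
The "if hash_info:" guard works because HashInfo is falsy until it carries an actual hash value. A minimal sketch of that assumed truthiness behavior (illustrative class, not dvc's exact definition):

from dataclasses import dataclass
from typing import Optional

@dataclass
class HashInfoSketch:
    name: Optional[str] = None
    value: Optional[str] = None
    size: Optional[int] = None

    def __bool__(self):
        # Falsy until a hash value is actually computed, so callers
        # can guard metadata updates with a plain `if hash_info:`.
        return bool(self.value)

assert not HashInfoSketch()
assert HashInfoSketch("md5", "acbd18db4cc2f85cedef654fccc4a4d8")
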
Example #2
def test_get_hash_cached_dir(tmp_dir, dvc, mocker):
    tmp_dir.dvc_gen(
        {"dir": {
            "foo": "foo",
            "bar": "bar",
            "subdir": {
                "data": "data"
            }
        }})
    tree = RepoTree(dvc)
    get_file_hash_spy = mocker.spy(tree, "get_file_hash")
    dvc_tree_spy = mocker.spy(tree._dvctrees[dvc.root_dir], "get_dir_hash")
    with dvc.state:
        assert tree.get_hash(PathInfo(tmp_dir) / "dir") == HashInfo(
            "md5",
            "8761c4e9acad696bee718615e23e22db.dir",
        )
    assert get_file_hash_spy.called
    assert not dvc_tree_spy.called
    get_file_hash_spy.reset_mock()

    shutil.rmtree(tmp_dir / "dir")
    with dvc.state:
        assert tree.get_hash(PathInfo(tmp_dir) / "dir") == HashInfo(
            "md5",
            "8761c4e9acad696bee718615e23e22db.dir",
        )
    assert not get_file_hash_spy.called
    assert dvc_tree_spy.called
Example #3
def test_status_download_optimization(mocker, dvc):
    """When comparing the status to pull a remote cache,
    And the desired files to fetch are already on the local cache,
    Don't check the existence of the desired files on the remote cache
    """
    odb = LocalObjectDB(LocalFileSystem(), PathInfo("."))

    objs = {
        HashFile(None, odb.fs,
                 HashInfo("md5", "acbd18db4cc2f85cedef654fccc4a4d8")),
        HashFile(None, odb.fs,
                 HashInfo("md5", "37b51d194a7513e45b56f6524f2d51f2")),
    }

    local_exists = [obj.hash_info.value for obj in objs]
    mocker.patch.object(odb, "hashes_exist", return_value=local_exists)

    other_remote = mocker.Mock()
    other_remote.url = "other_remote"
    other_remote.hashes_exist.return_value = []
    other_remote.index = RemoteIndexNoop()

    other_remote.status(odb, objs, download=True)

    assert other_remote.hashes_exist.call_count == 0
Example #4
def test_order(dvc):
    stage = create_stage(PipelineStage,
                         dvc,
                         deps=["input"],
                         outs=["output"],
                         params=["foo-param"],
                         **kwargs)
    params, deps = split_params_deps(stage)

    deps[0].hash_info = HashInfo("md5", "md-five")
    params[0].hash_info = HashInfo("params", {"foo-param": "value"})
    stage.outs[0].hash_info = HashInfo("md5", "md5-output")

    assert to_single_stage_lockfile(stage) == OrderedDict([
        ("cmd", "command"),
        ("deps", [{
            "path": "input",
            "md5": "md-five"
        }]),
        ("params", {
            "params.yaml": {
                "foo-param": "value"
            }
        }),
        ("outs", [{
            "path": "output",
            "md5": "md5-output"
        }]),
    ])
Example #5
def test_used_objs(tmp_dir, scm, dvc, run_copy, rev):
    from dvc.hash_info import HashInfo

    dvc.config["core"]["autostage"] = True
    tmp_dir.dvc_gen({"dir": {"subdir": {"file": "file"}}, "foo": "foo"})
    run_copy("foo", "bar", name="copy-foo-bar")
    scm.commit("commit")

    index = get_index(dvc, rev)

    expected_objs = [
        HashInfo(
            name="md5",
            value="acbd18db4cc2f85cedef654fccc4a4d8",
            obj_name="bar",
        ),
        HashInfo(
            name="md5",
            value="8c7dd922ad47494fc02c388e12c00eac",
            obj_name="dir/subdir/file",
        ),
        HashInfo(
            name="md5",
            value="d28c9e28591aeb7e303dc6772ffa6f6b.dir",
            obj_name="dir",
        ),
    ]

    assert index.used_objs() == {None: set(expected_objs)}
    assert index.used_objs("dir") == {None: set(expected_objs[1:])}
    assert index.used_objs(".", recursive=True) == {None: set(expected_objs)}
    assert index.used_objs("copy-foo-bar", with_deps=True) == {
        None: {expected_objs[0]}
    }
Example #6
    def get_hash(self, path_info, **kwargs):
        assert path_info and (isinstance(path_info, str)
                              or path_info.scheme == self.scheme)

        if not self.exists(path_info):
            return None

        # pylint: disable=assignment-from-none
        hash_ = self.state.get(path_info)

        # If we have dir hash in state db, but dir cache file is lost,
        # then we need to recollect the dir via .get_dir_hash() call below,
        # see https://github.com/iterative/dvc/issues/2219 for context
        if (hash_ and self.is_dir_hash(hash_) and not self.cache.tree.exists(
                self.cache.tree.hash_to_path_info(hash_))):
            hash_ = None

        if hash_:
            hash_info = HashInfo(self.PARAM_CHECKSUM, hash_)
            if hash_info.isdir:
                hash_info.dir_info = self.cache.get_dir_cache(hash_info)
            return hash_info

        if self.isdir(path_info):
            hash_info = self.get_dir_hash(path_info, **kwargs)
        else:
            hash_info = self.get_file_hash(path_info)

        if hash_info and self.exists(path_info):
            self.state.save(path_info, hash_info.value)

        return hash_info
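
A condensed sketch of the lookup order above: consult the state cache first, distrust a cached .dir hash whose dir-cache file is missing (the iterative/dvc#2219 workaround), then recompute and save. All dvc internals are stubbed out; the names below are illustrative:

def resolve_hash(path, state_cache, dir_cache_exists, compute):
    # state_cache stands in for self.state; compute stands in for
    # get_dir_hash()/get_file_hash().
    cached = state_cache.get(path)
    if cached and cached.endswith(".dir") and not dir_cache_exists(cached):
        cached = None  # dir hash known, but its .dir cache file was lost
    if cached:
        return cached
    value = compute(path)
    state_cache[path] = value  # self.state.save(...)
    return value

cache = {"dir": "123.dir"}
# The cached .dir hash is ignored because its dir-cache file is gone,
# so the hash is recollected via compute().
assert resolve_hash("dir", cache, lambda h: False, lambda p: "123.dir") == "123.dir"
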
Example #7
def test_get_hash_cached_granular(tmp_dir, dvc, mocker):
    tmp_dir.dvc_gen(
        {"dir": {
            "foo": "foo",
            "bar": "bar",
            "subdir": {
                "data": "data"
            }
        }})
    fs = RepoFileSystem(dvc)
    subdir = PathInfo(tmp_dir) / "dir" / "subdir"
    assert fs.info(subdir).get("md5") is None
    assert stage(dvc.odb.local, subdir, fs, "md5").hash_info == HashInfo(
        "md5",
        "af314506f1622d107e0ed3f14ec1a3b5.dir",
    )
    assert fs.info(subdir / "data").get("md5") is None
    assert stage(dvc.odb.local, subdir / "data", fs,
                 "md5").hash_info == HashInfo(
                     "md5",
                     "8d777f385d3dfec8815d20f7496026dc",
                 )
    (tmp_dir / "dir" / "subdir" / "data").unlink()
    assert (fs.info(subdir /
                    "data")["md5"] == "8d777f385d3dfec8815d20f7496026dc")
Example #8
def test_get_hash_cached_dir(tmp_dir, dvc, mocker):
    tmp_dir.dvc_gen(
        {"dir": {
            "foo": "foo",
            "bar": "bar",
            "subdir": {
                "data": "data"
            }
        }})
    fs = RepoFileSystem(dvc)
    expected = "8761c4e9acad696bee718615e23e22db.dir"
    assert fs.info(PathInfo(tmp_dir) / "dir").get("md5") is None
    assert stage(
        dvc.odb.local,
        PathInfo(tmp_dir) / "dir",
        fs,
        "md5",
    ).hash_info == HashInfo(
        "md5",
        "8761c4e9acad696bee718615e23e22db.dir",
    )

    shutil.rmtree(tmp_dir / "dir")
    assert fs.info(PathInfo(tmp_dir) / "dir")["md5"] == expected
    assert stage(
        dvc.odb.local,
        PathInfo(tmp_dir) / "dir",
        fs,
        "md5",
    ).hash_info == HashInfo(
        "md5",
        "8761c4e9acad696bee718615e23e22db.dir",
    )
Example #9
    def get_file_hash(self, path_info):
        outs = self._find_outs(path_info, strict=False)
        if len(outs) != 1:
            raise OutputNotFoundError
        out = outs[0]
        if out.is_dir_checksum:
            return HashInfo(
                out.tree.PARAM_CHECKSUM,
                self._get_granular_checksum(path_info, out),
            )
        return HashInfo(out.tree.PARAM_CHECKSUM, out.checksum)
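
The granular branch above serves a path that lives inside a directory-checksummed output; _get_granular_checksum presumably resolves the file by looking it up in the output's .dir manifest. A minimal illustration of that lookup idea, assuming the list-of-entries manifest shape that dvc stores for directories:

# Hypothetical manifest for a directory output, mirroring the
# list-of-{"md5", "relpath"} entries in a .dir cache file.
manifest = [
    {"md5": "8d777f385d3dfec8815d20f7496026dc", "relpath": "subdir/data"},
]

def granular_checksum(manifest, relpath):
    for entry in manifest:
        if entry["relpath"] == relpath:
            return entry["md5"]
    raise FileNotFoundError(relpath)

assert granular_checksum(manifest, "subdir/data") == (
    "8d777f385d3dfec8815d20f7496026dc")
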
Example #10
    def get_file_hash(self, path_info):
        if path_info.scheme != self.scheme:
            raise NotImplementedError

        with self.ssh(path_info) as ssh:
            hash_info = HashInfo(self.PARAM_CHECKSUM, ssh.md5(path_info.path),)

            if hash_info:
                hash_info.size = ssh.getsize(path_info.path)

            return hash_info
Example #11
def test_fill_from_lock_deps_outs(dvc, lock_data):
    stage = create_stage(
        PipelineStage, dvc, PIPELINE_FILE, deps=["foo"], outs=["bar"]
    )

    for item in chain(stage.deps, stage.outs):
        assert not item.hash_info

    StageLoader.fill_from_lock(stage, lock_data)

    assert stage.deps[0].hash_info == HashInfo("md5", "foo_checksum")
    assert stage.outs[0].hash_info == HashInfo("md5", "bar_checksum")
Example #12
def test_load_stage_with_params(dvc, stage_data, lock_data):
    lock_data["params"] = {"params.yaml": {"lorem": "ipsum"}}
    stage_data["params"] = ["lorem"]
    dvcfile = Dvcfile(dvc, PIPELINE_FILE)
    stage = StageLoader.load_stage(dvcfile, "stage-1", stage_data, lock_data)

    params, deps = split_params_deps(stage)
    assert deps[0].def_path == "foo" and stage.outs[0].def_path == "bar"
    assert params[0].def_path == "params.yaml"
    assert params[0].hash_info == HashInfo("params", {"lorem": "ipsum"})
    assert deps[0].hash_info == HashInfo("md5", "foo_checksum")
    assert stage.outs[0].hash_info == HashInfo("md5", "bar_checksum")
Example #13
def test_load_stage(dvc, stage_data, lock_data):
    dvcfile = Dvcfile(dvc, PIPELINE_FILE)
    stage = StageLoader.load_stage(dvcfile, "stage-1", stage_data, lock_data)

    assert stage.wdir == os.path.abspath(os.curdir)
    assert stage.name == "stage-1"
    assert stage.cmd == "command"
    assert stage.path == os.path.abspath(PIPELINE_FILE)
    assert stage.deps[0].def_path == "foo"
    assert stage.deps[0].hash_info == HashInfo("md5", "foo_checksum")
    assert stage.outs[0].def_path == "bar"
    assert stage.outs[0].hash_info == HashInfo("md5", "bar_checksum")
Example #14
def test_fill_from_lock_use_appropriate_checksum(dvc, lock_data):
    stage = create_stage(
        PipelineStage,
        dvc,
        PIPELINE_FILE,
        deps=["s3://dvc-temp/foo"],
        outs=["bar"],
    )
    lock_data["deps"] = [{"path": "s3://dvc-temp/foo", "etag": "e-tag"}]
    StageLoader.fill_from_lock(stage, lock_data)
    assert stage.deps[0].hash_info == HashInfo("etag", "e-tag")
    assert stage.outs[0].hash_info == HashInfo("md5", "bar_checksum")
Example #15
def test_used_objs(tmp_dir, dvc, path):
    tmp_dir.dvc_gen({"dir": {"subdir": {"file": "file"}, "other": "other"}})

    expected = {
        HashInfo("md5", "70922d6bf66eb073053a82f77d58c536.dir"),
        HashInfo("md5", "8c7dd922ad47494fc02c388e12c00eac"),
    }

    used = set()
    for _, obj_ids in dvc.used_objs([path]).items():
        used.update(obj_ids)

    assert used == expected
Example #16
def test_fill_from_lock_missing_checksums(dvc, lock_data):
    stage = create_stage(
        PipelineStage,
        dvc,
        PIPELINE_FILE,
        deps=["foo", "foo1"],
        outs=["bar", "bar1"],
    )

    StageLoader.fill_from_lock(stage, lock_data)

    assert stage.deps[0].hash_info == HashInfo("md5", "foo_checksum")
    assert stage.outs[0].hash_info == HashInfo("md5", "bar_checksum")
    assert not stage.deps[1].hash_info and not stage.outs[1].hash_info
Example #17
    def test(self):
        dir_hash = "123.dir"
        fname = os.fspath(
            self.dvc.cache.local.tree.hash_to_path_info(dir_hash))
        self.create(fname, "<clearly>not,json")
        with pytest.raises(DirCacheError):
            self.dvc.cache.local.load_dir_cache(HashInfo("md5", dir_hash))

        dir_hash = "234.dir"
        fname = os.fspath(
            self.dvc.cache.local.tree.hash_to_path_info(dir_hash))
        self.create(fname, '{"a": "b"}')
        self._do_test(
            self.dvc.cache.local.load_dir_cache(HashInfo("md5", dir_hash)))
Example #18
def test_get_hash_cached_granular(tmp_dir, dvc, mocker):
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    )
    tree = RepoTree(dvc)
    dvc_tree_spy = mocker.spy(tree._dvctrees[dvc.root_dir], "get_file_hash")
    subdir = PathInfo(tmp_dir) / "dir" / "subdir"
    assert tree.get_hash(subdir) == HashInfo(
        "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir",
    )
    assert tree.get_hash(subdir / "data") == HashInfo(
        "md5", "8d777f385d3dfec8815d20f7496026dc",
    )
    assert dvc_tree_spy.called
Example #19
def test_cache_load_bad_dir_cache(tmp_dir, dvc):
    from dvc.data import load

    dir_hash = "123.dir"
    fname = os.fspath(dvc.odb.local.hash_to_path(dir_hash))
    tmp_dir.gen({fname: "<clearly>not,json"})
    with pytest.raises(ObjectFormatError):
        load(dvc.odb.local, HashInfo("md5", dir_hash))

    dir_hash = "234.dir"
    fname = os.fspath(dvc.odb.local.hash_to_path(dir_hash))
    tmp_dir.gen({fname: '{"a": "b"}'})
    with pytest.raises(ObjectFormatError):
        load(dvc.odb.local, HashInfo("md5", dir_hash))
Example #20
    def test(self):
        from dvc.objects import load

        dir_hash = "123.dir"
        fname = os.fspath(self.dvc.odb.local.hash_to_path(dir_hash))
        self.create(fname, "<clearly>not,json")
        with pytest.raises(ObjectFormatError):
            load(self.dvc.odb.local, HashInfo("md5", dir_hash))

        dir_hash = "234.dir"
        fname = os.fspath(self.dvc.odb.local.hash_to_path(dir_hash))
        self.create(fname, '{"a": "b"}')
        with pytest.raises(ObjectFormatError):
            load(self.dvc.odb.local, HashInfo("md5", dir_hash))
Example #21
def test_get_hash_granular(tmp_dir, dvc):
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    )
    fs = DvcFileSystem(repo=dvc)
    subdir = "dir/subdir"
    assert fs.info(subdir).get("md5") is None
    _, _, obj = stage(dvc.odb.local, subdir, fs, "md5", dry_run=True)
    assert obj.hash_info == HashInfo(
        "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir"
    )
    data = posixpath.join(subdir, "data")
    assert fs.info(data)["md5"] == "8d777f385d3dfec8815d20f7496026dc"
    _, _, obj = stage(dvc.odb.local, data, fs, "md5", dry_run=True)
    assert obj.hash_info == HashInfo("md5", "8d777f385d3dfec8815d20f7496026dc")
Example #22
def test_fill_from_lock_with_missing_sections(dvc, lock_data):
    stage = create_stage(
        PipelineStage, dvc, PIPELINE_FILE, deps=["foo"], outs=["bar"]
    )
    lock = deepcopy(lock_data)
    del lock["deps"]
    StageLoader.fill_from_lock(stage, lock)
    assert not stage.deps[0].hash_info
    assert stage.outs[0].hash_info == HashInfo("md5", "bar_checksum")

    lock = deepcopy(lock_data)
    del lock["outs"]
    StageLoader.fill_from_lock(stage, lock)
    assert stage.deps[0].hash_info == HashInfo("md5", "foo_checksum")
    assert not stage.outs[0].hash_info
Example #23
def test_lock_outs_order(dvc, typ):
    stage = create_stage(PipelineStage, dvc, **{typ: ["input1", "input0"]},
                         **kwargs)
    stage.outs[0].hash_info = HashInfo("md5", "md-one1")
    stage.outs[1].hash_info = HashInfo("md5", "md-zer0")
    assert to_single_stage_lockfile(stage) == OrderedDict([
        ("cmd", "command"),
        (
            "outs",
            [
                OrderedDict([("path", "input0"), ("md5", "md-zer0")]),
                OrderedDict([("path", "input1"), ("md5", "md-one1")]),
            ],
        ),
    ])
Example #24
File: hdfs.py Project: yyqgood/dvc
    def get_file_hash(self, path_info):
        # NOTE: pyarrow doesn't support checksum, so we need to use hadoop
        regex = r".*\t.*\t(?P<checksum>.*)"
        stdout = self.hadoop_fs(f"checksum {path_info.url}",
                                user=path_info.user)
        hash_info = HashInfo(
            self.PARAM_CHECKSUM,
            self._group(regex, stdout, "checksum"),
        )

        with self.hdfs(path_info) as hdfs:
            file_info = hdfs.get_file_info(path_info.path)
            hash_info.size = file_info.size

        return hash_info
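
The regex above captures the third tab-separated column of "hadoop fs -checksum" output via a named group, which is presumably what self._group() extracts. A standalone check of that parsing (the sample output line is made up):

import re

regex = r".*\t.*\t(?P<checksum>.*)"
stdout = "/data/file\tMD5-of-0MD5-of-512CRC32C\t000002000000000000000000badc0ffee"
match = re.match(regex, stdout)
assert match is not None
assert match.group("checksum") == "000002000000000000000000badc0ffee"
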
Example #25
def _get_file_hash(path_info, fs, name):
    info = fs.info(path_info)
    if name in info:
        assert not info[name].endswith(".dir")
        return HashInfo(name, info[name], size=info["size"])

    func = getattr(fs, name, None)
    if func:
        return func(path_info)

    if name == "md5":
        return HashInfo(
            name, file_md5(path_info, fs), size=fs.getsize(path_info)
        )

    raise NotImplementedError
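
The helper falls through three sources in order: a hash already present in fs.info(), a filesystem-specific hash method, and finally an md5 computed from the file contents. A sketch of a filesystem stub hitting the fast path; FakeFS is illustrative and assumes dvc's HashInfo is importable alongside the helper above:

class FakeFS:
    # Satisfies the first branch of _get_file_hash(): the hash is
    # already in the info() dict, so nothing is read from disk.
    def info(self, path_info):
        return {"etag": "abc123", "size": 4}

hash_info = _get_file_hash("s3://bucket/key", FakeFS(), "etag")
assert hash_info.name == "etag" and hash_info.size == 4
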
Example #26
def _get_tree_obj(path_info, fs, name, odb, state, upload, **kwargs):
    from .tree import Tree

    value = fs.info(path_info).get(name)
    if value:
        hash_info = HashInfo(name, value)
        try:
            return Tree.load(odb, hash_info)
        except FileNotFoundError:
            pass

    tree = _build_tree(path_info, fs, name, odb, state, upload, **kwargs)

    odb.add(tree.path_info, tree.fs, tree.hash_info)
    if name != "md5":
        # NOTE: used only for external outputs. Initial reasoning was to be
        # able to validate .dir files right in the workspace (e.g. check s3
        # etag), but could be dropped for manual validation with regular md5,
        # that would be universal for all clouds.
        raw = odb.get(tree.hash_info)
        hash_info = get_file_hash(raw.path_info, raw.fs, name, state)
        tree.hash_info.name = hash_info.name
        tree.hash_info.value = hash_info.value
        if not tree.hash_info.value.endswith(".dir"):
            tree.hash_info.value += ".dir"
        odb.add(tree.path_info, tree.fs, tree.hash_info)

    return tree
Example #27
File: s3.py Project: jubaer145/dvc
    def get_file_hash(self, path_info):
        with self._get_obj(path_info) as obj:
            return HashInfo(
                self.PARAM_CHECKSUM,
                obj.e_tag.strip('"'),
                size=obj.content_length,
            )
Example #28
    def get(self, path_info, fs):
        """Gets the hash for the specified path info. Hash will be
        retrieved from the state database if available.

        Args:
            path_info (str or PathInfo): path to get the hash for.
            fs (FileSystem): filesystem the path is on; only local
                paths are served from the state database.

        Returns:
            HashInfo or None: hash for the specified path info or None if it
            doesn't exist in the state database.
        """
        if not isinstance(fs, LocalFileSystem):
            return None

        assert isinstance(path_info, str) or path_info.scheme == "local"
        path = os.fspath(path_info)

        # NOTE: use os.path.exists instead of LocalFileSystem.exists
        # because it uses lexists() and will return True for broken
        # symlinks that we cannot stat() in get_mtime_and_size
        if not os.path.exists(path):
            return None

        mtime, size = get_mtime_and_size(path, self.fs)
        inode = get_inode(path)

        value = self.md5s.get(inode)

        if not value or value[0] != mtime or value[1] != size:
            return None

        return HashInfo("md5", value[2], size=int(size))
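
The state entry behind self.md5s is keyed by inode and stores (mtime, size, md5); the cached hash is only trusted while both mtime and size still match. A minimal in-memory sketch of that invalidation rule:

md5s = {}  # inode -> (mtime, size, md5)

def save(inode, mtime, size, md5):
    md5s[inode] = (mtime, size, md5)

def get(inode, mtime, size):
    value = md5s.get(inode)
    # Any change to mtime or size invalidates the cached hash.
    if not value or value[0] != mtime or value[1] != size:
        return None
    return value[2]

save(42, "1000000000", "3", "acbd18db4cc2f85cedef654fccc4a4d8")
assert get(42, "1000000000", "3") is not None
assert get(42, "1000000001", "3") is None  # file was touched
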
Example #29
    def fill_from_lock(stage, lock_data=None):
        """Fill values for params, checksums for outs and deps from lock."""
        if not lock_data:
            return

        assert isinstance(lock_data, dict)
        items = chain(
            ((StageParams.PARAM_DEPS, dep) for dep in stage.deps),
            ((StageParams.PARAM_OUTS, out) for out in stage.outs),
        )

        checksums = {
            key: {item["path"]: item
                  for item in lock_data.get(key, {})}
            for key in [StageParams.PARAM_DEPS, StageParams.PARAM_OUTS]
        }
        for key, item in items:
            path = item.def_path
            if isinstance(item, dependency.ParamsDependency):
                item.fill_values(get_in(lock_data, [stage.PARAM_PARAMS, path]))
                continue
            info = get_in(checksums, [key, path], {})
            info = info.copy()
            info.pop("path", None)
            item.hash_info = HashInfo.from_dict(info)
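
For reference, a lock_data dict of the shape this method consumes, matching the fixtures used in the tests above (the checksums are the placeholder values from those fixtures):

lock_data = {
    "cmd": "command",
    "deps": [{"path": "foo", "md5": "foo_checksum"}],
    "outs": [{"path": "bar", "md5": "bar_checksum"}],
    "params": {"params.yaml": {"lorem": "ipsum"}},
}
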
Example #30
def test_lock_deps(dvc):
    stage = create_stage(PipelineStage, dvc, deps=["input"], **kwargs)
    stage.deps[0].hash_info = HashInfo("md5", "md-five")
    assert to_single_stage_lockfile(stage) == OrderedDict([
        ("cmd", "command"),
        ("deps", [OrderedDict([("path", "input"), ("md5", "md-five")])]),
    ])