Example #1
0
def test_walk_fs_with_git(tmp_dir, scm):
    """Check ``LocalFileSystem.walk`` on a generated tree, skipping ``.git``.

    The tree is generated under ``tmp_dir``. The first walk starts at ``"."``
    and strips ``.git`` from every directory listing; the second walk starts
    directly at the nested ``data_dir/data_sub_dir`` directory.
    """
    script = (
        "import sys\nimport shutil\n"
        "shutil.copyfile(sys.argv[1], sys.argv[2])"
    )
    layout = {
        "foo": "foo",
        "bar": "bar",
        "тест": "проверка",
        "code.py": script,
        "data_dir": {
            "data": "data",
            "data_sub_dir": {"data_sub": "data_sub"},
        },
    }
    tmp_dir.gen(layout)
    fs = LocalFileSystem(url=str(tmp_dir))

    visited = []
    for root, dirs, files in fs.walk("."):
        # Slice assignment rewrites the yielded list object itself, so the
        # ".git" entry is removed from what walk() handed out (presumably this
        # also stops the walk from descending into it, as with os.walk).
        dirs[:] = [name for name in dirs if name != ".git"]
        visited.append((root, dirs, files))

    assert convert_to_sets(visited) == [
        (".", {"data_dir"}, {"foo", "bar", "тест", "code.py"}),
        (join("data_dir"), {"data_sub_dir"}, {"data"}),
        (join("data_dir", "data_sub_dir"), set(), {"data_sub"}),
    ]

    # Walking from the innermost directory yields just that one entry.
    nested_walk = fs.walk(join("data_dir", "data_sub_dir"))
    assert convert_to_sets(nested_walk) == [
        (join("data_dir", "data_sub_dir"), set(), {"data_sub"}),
    ]
Example #2
0
def test_walk_no_scm(tmp_dir):
    """Check ``LocalFileSystem.walk`` without any SCM fixture.

    The tree is walked once from the absolute ``tmp_dir`` path and once from
    the relative ``data_dir/data_sub_dir`` path, using a ``LocalFileSystem``
    created with no arguments.
    """
    script = (
        "import sys\nimport shutil\n"
        "shutil.copyfile(sys.argv[1], sys.argv[2])"
    )
    layout = {
        "foo": "foo",
        "bar": "bar",
        "тест": "проверка",
        "code.py": script,
        "data_dir": {
            "data": "data",
            "data_sub_dir": {"data_sub": "data_sub"},
        },
    }
    tmp_dir.gen(layout)
    fs = LocalFileSystem()

    # Absolute walk: every root is expected as a full path under tmp_dir.
    absolute_walk = fs.walk(str(tmp_dir))
    assert convert_to_sets(absolute_walk) == [
        (str(tmp_dir), {"data_dir"}, {"foo", "bar", "тест", "code.py"}),
        (str(tmp_dir / "data_dir"), {"data_sub_dir"}, {"data"}),
        (str(tmp_dir / "data_dir" / "data_sub_dir"), set(), {"data_sub"}),
    ]

    # Relative walk: the root is expected back exactly as it was passed in.
    relative_walk = fs.walk(join("data_dir", "data_sub_dir"))
    assert convert_to_sets(relative_walk) == [
        (join("data_dir", "data_sub_dir"), set(), {"data_sub"}),
    ]
Example #3
0
def test_staging_file(tmp_dir, dvc):
    """Stage one file, then move it from the staging ODB to the local ODB.

    Before the transfer the object must exist (and pass ``check``) only in
    the staging ODB. After ``transfer(..., move=True)`` the expectation is
    reversed, and the object's path in the local ODB must exist on the
    filesystem.
    """
    from dvc.objects import check
    from dvc.objects.stage import stage
    from dvc.objects.transfer import transfer

    tmp_dir.gen("foo", "foo")
    local_fs = LocalFileSystem()

    cache_odb = dvc.odb.local
    staging, staged = stage(cache_odb, tmp_dir / "foo", local_fs, "md5")

    # Freshly staged: present in staging only.
    assert not cache_odb.exists(staged.hash_info)
    assert staging.exists(staged.hash_info)

    with pytest.raises(FileNotFoundError):
        check(cache_odb, staged)
    check(staging, staged)

    # move=True: afterwards the object is expected in the local ODB only.
    transfer(staging, cache_odb, {staged.hash_info}, move=True)
    check(cache_odb, staged)
    with pytest.raises(FileNotFoundError):
        check(staging, staged)

    object_path = cache_odb.hash_to_path_info(staged.hash_info.value)
    assert local_fs.exists(object_path)
Example #4
0
    def __init__(self, repo):
        """Initialise state bookkeeping for ``repo``.

        Stores the repo, its root directory and a ``LocalFileSystem`` whose
        config carries that root as ``url``. ``row_limit`` and
        ``row_cleanup_quota`` come from the repo's ``state`` config section,
        falling back to the class-level defaults.

        When ``repo.tmp_dir`` is falsy, ``state_file`` is set to ``None`` and
        the method returns without assigning ``temp_files``, ``database``,
        ``cursor`` or ``inserts``.
        """
        from dvc.fs.local import LocalFileSystem

        super().__init__()

        self.repo = repo
        self.root_dir = repo.root_dir
        self.fs = LocalFileSystem(None, {"url": self.root_dir})

        settings = repo.config.get("state", {})
        self.row_limit = settings.get("row_limit", self.STATE_ROW_LIMIT)
        self.row_cleanup_quota = settings.get(
            "row_cleanup_quota", self.STATE_ROW_CLEANUP_QUOTA
        )

        if not repo.tmp_dir:
            self.state_file = None
            return

        self.state_file = os.path.join(repo.tmp_dir, self.STATE_FILE)

        # Side files SQLite may create next to the database:
        # https://www.sqlite.org/tempfiles.html
        self.temp_files = [
            self.state_file + suffix for suffix in ("-journal", "-wal")
        ]

        self.database = self.cursor = None
        self.inserts = 0
Example #5
0
def test_staging_dir(tmp_dir, dvc):
    """Stage a directory, then hardlink-transfer it into the local ODB.

    Before the transfer the object must exist (and pass ``check``) only in
    the staging ODB. After a non-shallow, hardlinking ``transfer`` it must
    pass ``check`` in both ODBs, and its path in the local ODB must exist.
    """
    from dvc.data import check
    from dvc.data.stage import stage
    from dvc.data.transfer import transfer

    tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}})
    local_fs = LocalFileSystem()
    cache_odb = dvc.odb.local

    dir_path = (tmp_dir / "dir").fs_path
    staging, _, staged = stage(cache_odb, dir_path, local_fs, "md5")

    # Freshly staged: present in staging only.
    assert not cache_odb.exists(staged.hash_info)
    assert staging.exists(staged.hash_info)

    with pytest.raises(FileNotFoundError):
        check(cache_odb, staged)
    check(staging, staged)

    transfer(
        staging,
        cache_odb,
        {staged.hash_info},
        shallow=False,
        hardlink=True,
    )
    # Unlike a move, the object is expected to remain valid in both ODBs.
    check(cache_odb, staged)
    check(staging, staged)

    object_path = cache_odb.hash_to_path(staged.hash_info.value)
    assert local_fs.exists(object_path)
Example #6
0
 def test_nobranch(self):
     """Walk the root dir, then its nested subdir, with dvcignore enabled."""
     fs = LocalFileSystem(None, {"url": self._root_dir}, use_dvcignore=True)

     whole_tree = fs.walk(".")
     expected_tree = [
         (".", ["data_dir"], ["bar", "тест", "code.py", "foo"]),
         (join("data_dir"), ["data_sub_dir"], ["data"]),
         (join("data_dir", "data_sub_dir"), [], ["data_sub"]),
     ]
     self.assertWalkEqual(whole_tree, expected_tree)

     # Starting from the innermost directory yields a single entry.
     nested_tree = fs.walk(join("data_dir", "data_sub_dir"))
     expected_nested = [(join("data_dir", "data_sub_dir"), [], ["data_sub"])]
     self.assertWalkEqual(nested_tree, expected_nested)
Example #7
0
    def __init__(self, root_dir=None, tmp_dir=None):
        """Store the directories and open the ``links`` / ``md5s`` caches.

        A ``LocalFileSystem`` is always created with ``root_dir`` as its
        ``url``. The two ``diskcache.Cache`` objects are only created when
        ``tmp_dir`` is truthy; otherwise ``links`` and ``md5s`` are not
        assigned by this method.
        """
        from diskcache import Cache

        super().__init__()

        self.tmp_dir = tmp_dir
        self.root_dir = root_dir
        self.fs = LocalFileSystem(None, {"url": self.root_dir})

        if not tmp_dir:
            return

        links_dir = os.path.join(tmp_dir, "links")
        self.links = Cache(
            directory=links_dir, eviction_policy="least-recently-used"
        )
        md5s_dir = os.path.join(tmp_dir, "md5s")
        self.md5s = Cache(
            directory=md5s_dir, eviction_policy="least-recently-used"
        )
Example #8
0
def test_status_download_optimization(mocker, dvc):
    """When comparing the status to pull a remote cache,
    And the desired files to fetch are already on the local cache,
    Don't check the existence of the desired files on the remote cache
    """
    local_odb = LocalObjectDB(LocalFileSystem(), PathInfo("."))

    wanted = {
        HashFile(
            None,
            local_odb.fs,
            HashInfo("md5", "acbd18db4cc2f85cedef654fccc4a4d8"),
        ),
        HashFile(
            None,
            local_odb.fs,
            HashInfo("md5", "37b51d194a7513e45b56f6524f2d51f2"),
        ),
    }

    # Pretend every wanted hash is already present in the local ODB.
    present_locally = [item.hash_info.value for item in wanted]
    mocker.patch.object(
        local_odb, "hashes_exist", return_value=present_locally
    )

    # NOTE(review): ``remote`` is a bare Mock, so ``remote.status`` below is
    # itself a mock attribute — confirm this still exercises real status code.
    remote = mocker.Mock()
    remote.url = "other_remote"
    remote.hashes_exist.return_value = []
    remote.index = RemoteIndexNoop()

    remote.status(local_odb, wanted, download=True)

    assert remote.hashes_exist.call_count == 0
Example #9
0
    def __init__(
        self,
        dvc_dir=None,
        validate=True,
        fs=None,
        config=None,
    ):  # pylint: disable=super-init-not-called
        """Resolve the DVC directory, set up filesystems and load the config.

        A truthy ``dvc_dir`` is resolved with ``realpath`` and then
        ``abspath``. Otherwise the directory is looked up via
        ``Repo.find_dvc_dir()``, and ``dvc_dir`` becomes ``None`` if that
        raises ``NotDvcRepoError``. ``wfs`` is always a ``LocalFileSystem``
        on the resulting directory; ``fs`` is the caller's ``fs`` when truthy
        and ``wfs`` otherwise. Finally ``load`` is called with ``validate``
        and ``config``.
        """
        from dvc.fs.local import LocalFileSystem

        self.dvc_dir = dvc_dir

        if dvc_dir:
            resolved = os.path.realpath(dvc_dir)
            self.dvc_dir = os.path.abspath(resolved)
        else:
            try:
                from dvc.repo import Repo

                self.dvc_dir = os.path.join(Repo.find_dvc_dir())
            except NotDvcRepoError:
                self.dvc_dir = None

        self.wfs = LocalFileSystem(None, {"url": self.dvc_dir})
        self.fs = fs if fs else self.wfs

        self.load(validate=validate, config=config)
Example #10
0
def test_track_from_multiple_files(tmp_dir):
    """Check per-file key tracking after merging two params files.

    ``params.yaml`` provides ``Train.us.lr`` and ``params2.yaml`` provides
    ``Train.us.layers``. Selecting the intermediate dicts must track nothing
    for either file, while each leaf must be tracked only against the file it
    came from. The same holds when a leaf is reached through the ``us``
    alias.
    """
    lr_params = {"Train": {"us": {"lr": 10}}}
    layers_params = {"Train": {"us": {"layers": 100}}}

    fs = LocalFileSystem()
    path1 = tmp_dir / "params.yaml"
    path2 = tmp_dir / "params2.yaml"
    path1.dump(lr_params, fs=fs)
    path2.dump(layers_params, fs=fs)

    context = Context.load_from(fs, path1)
    other = Context.load_from(fs, path2)
    context.merge_update(other)

    def key_tracked(tracked_map, source, key):
        # Tracked data is looked up by the source file's relative path.
        return key in tracked_map[relpath(source)]

    with context.track() as tracked:
        context.select("Train")
        assert not key_tracked(tracked, path1, "Train")
        assert not key_tracked(tracked, path2, "Train")

        context.select("Train.us")
        assert not key_tracked(tracked, path1, "Train.us")
        assert not key_tracked(tracked, path2, "Train.us")

        context.select("Train.us.lr")
        assert key_tracked(tracked, path1, "Train.us.lr")
        assert not key_tracked(tracked, path2, "Train.us.lr")

        context.select("Train.us.layers")
        assert not key_tracked(tracked, path1, "Train.us.layers")
        assert key_tracked(tracked, path2, "Train.us.layers")

    context = Context.clone(context)
    assert not context._tracked_data

    # let's see with an alias
    context["us"] = context["Train"]["us"]
    with context.track() as tracked:
        context.select("us")
        assert not key_tracked(tracked, path1, "Train.us")
        assert not key_tracked(tracked, path2, "Train.us")

        context.select("us.lr")
        assert key_tracked(tracked, path1, "Train.us.lr")
        assert not key_tracked(tracked, path2, "Train.us.lr")

        context.select("Train.us.layers")
        assert not key_tracked(tracked, path1, "Train.us.layers")
        assert key_tracked(tracked, path2, "Train.us.layers")
Example #11
0
 def test_nobranch(self):
     """Walk the root dir (skipping ``.git``), then its nested subdir."""
     fs = LocalFileSystem(url=self._root_dir)

     visited = []
     for root, dirs, files in fs.walk("."):
         # Slice assignment edits the yielded list in place to drop ".git".
         dirs[:] = [name for name in dirs if name != ".git"]
         visited.append((root, dirs, files))

     expected_tree = [
         (".", ["data_dir"], ["bar", "тест", "code.py", "foo"]),
         (join("data_dir"), ["data_sub_dir"], ["data"]),
         (join("data_dir", "data_sub_dir"), [], ["data_sub"]),
     ]
     self.assertWalkEqual(visited, expected_tree)

     # Starting from the innermost directory yields a single entry.
     nested_tree = fs.walk(join("data_dir", "data_sub_dir"))
     expected_nested = [(join("data_dir", "data_sub_dir"), [], ["data_sub"])]
     self.assertWalkEqual(nested_tree, expected_nested)
Example #12
0
 def test(self):
     """Walk from the absolute root dir and compare against the full tree."""
     fs = LocalFileSystem(None, {"url": self._root_dir})
     walked = fs.walk(self._root_dir)

     top_level = (
         self._root_dir,
         ["data_dir"],
         ["code.py", "bar", "тест", "foo"],
     )
     data_level = (
         join(self._root_dir, "data_dir"),
         ["data_sub_dir"],
         ["data"],
     )
     sub_level = (
         join(self._root_dir, "data_dir", "data_sub_dir"),
         [],
         ["data_sub"],
     )
     self.assertWalkEqual(walked, [top_level, data_level, sub_level])
Example #13
0
def test_local_fs_isfile(tmp_dir):
    """``isfile`` must be true for a file, false for a dir or a missing path."""
    script = (
        "import sys\nimport shutil\n"
        "shutil.copyfile(sys.argv[1], sys.argv[2])"
    )
    layout = {
        "foo": "foo",
        "bar": "bar",
        "тест": "проверка",
        "code.py": script,
        "data_dir": {
            "data": "data",
            "data_sub_dir": {"data_sub": "data_sub"},
        },
    }
    tmp_dir.gen(layout)
    fs = LocalFileSystem()

    assert fs.isfile("foo")
    for not_a_file in ("data_dir", "not-existing-file"):
        assert not fs.isfile(not_a_file)
Example #14
0
    def __init__(self, root_dir=None, tmp_dir=None, dvcignore=None):
        """Store the settings and open the ``links`` / ``md5s`` caches.

        A ``LocalFileSystem`` is always created. The two ``diskcache.Cache``
        objects are only created when ``tmp_dir`` is truthy; otherwise
        ``links`` and ``md5s`` are not assigned by this method.
        """
        from diskcache import Cache

        super().__init__()

        self.tmp_dir = tmp_dir
        self.root_dir = root_dir
        self.dvcignore = dvcignore
        self.fs = LocalFileSystem()

        if not tmp_dir:
            return

        # Both caches are opened with the same options.
        cache_options = dict(
            eviction_policy="least-recently-used",
            disk_pickle_protocol=4,
        )
        links_dir = os.path.join(tmp_dir, "links")
        self.links = Cache(directory=links_dir, **cache_options)
        md5s_dir = os.path.join(tmp_dir, "md5s")
        self.md5s = Cache(directory=md5s_dir, **cache_options)
Example #15
0
def test_local_fs_open(tmp_dir):
    """Files opened via ``LocalFileSystem.open`` return the generated text.

    Covers both an ASCII and a non-ASCII file name, each read as UTF-8.
    """
    script = (
        "import sys\nimport shutil\n"
        "shutil.copyfile(sys.argv[1], sys.argv[2])"
    )
    layout = {
        "foo": "foo",
        "bar": "bar",
        "тест": "проверка",
        "code.py": script,
        "data_dir": {
            "data": "data",
            "data_sub_dir": {"data_sub": "data_sub"},
        },
    }
    tmp_dir.gen(layout)
    fs = LocalFileSystem()

    for name, expected_text in (("foo", "foo"), ("тест", "проверка")):
        with fs.open(name, encoding="utf-8") as fobj:
            assert fobj.read() == expected_text
Example #16
0
def test_hashed_stream_reader(tmp_dir):
    """The reader's hash of a small text file must match ``file_md5``."""
    tmp_dir.gen({"foo": "foo"})
    target = tmp_dir / "foo"

    with open(target, "rb") as stream:
        reader = HashedStreamReader(stream)
        assert reader.read(3) == b"foo"

    # The reader's results are inspected after the underlying file is closed.
    expected_digest = file_md5(target, LocalFileSystem(None, {}))
    assert reader.is_text_file
    assert expected_digest == reader.hash_info.value
Example #17
0
def test_hashed_stream_reader_as_chunks(tmp_dir):
    """Chunked reads of a file with NUL bytes must hash like ``file_md5``.

    The content contains ``\\x00`` bytes and is expected to be classified as
    non-text by the reader.
    """
    tmp_dir.gen({"foo": b"foo \x00" * 16})
    target = tmp_dir / "foo"

    with open(target, "rb") as stream:
        reader = HashedStreamReader(stream)
        # Drain the stream 16 bytes at a time until an empty chunk comes back.
        while reader.read(16):
            pass

    expected_digest = file_md5(target, LocalFileSystem(None, {}))
    assert not reader.is_text_file
    assert expected_digest == reader.hash_info.value
Example #18
0
    def __init__(self, repo):
        """Initialise state bookkeeping for ``repo``.

        Stores the repo, its root directory and a ``LocalFileSystem`` whose
        config carries that root as ``url``. ``row_limit`` and
        ``row_cleanup_quota`` come from the repo's ``state`` config section,
        falling back to the class-level defaults.

        When ``repo.tmp_dir`` is falsy, ``state_file`` is set to ``None`` and
        the method returns without assigning ``database``, ``cursor`` or
        ``inserts``.
        """
        super().__init__()

        self.repo = repo
        self.root_dir = repo.root_dir
        self.fs = LocalFileSystem(None, {"url": self.root_dir})

        settings = repo.config.get("state", {})
        self.row_limit = settings.get("row_limit", self.STATE_ROW_LIMIT)
        self.row_cleanup_quota = settings.get(
            "row_cleanup_quota", self.STATE_ROW_CLEANUP_QUOTA
        )

        if not repo.tmp_dir:
            self.state_file = None
            return

        self.state_file = os.path.join(repo.tmp_dir, self.STATE_FILE)

        self.database = self.cursor = None
        self.inserts = 0
Example #19
0
File: index.py Project: efiop/dvc
    def __init__(
        self,
        tmp_dir: "StrPath",
        name: str,
    ):  # pylint: disable=super-init-not-called
        """Create the index directory for ``name`` and open a disk index.

        The directory is ``<tmp_dir>/<INDEX_DIR>/<name>`` and is created if
        missing before ``diskcache.Index`` is opened on it.
        """
        from diskcache import Index

        from dvc.fs.local import LocalFileSystem
        from dvc.utils.fs import makedirs

        index_path = os.path.join(tmp_dir, self.INDEX_DIR, name)
        self.index_dir = index_path
        makedirs(self.index_dir, exist_ok=True)
        self.fs = LocalFileSystem()
        self.index = Index(self.index_dir)
Example #20
0
    def test(self):
        """Check the types and sizes returned by ``get_mtime_and_size``.

        The file size must equal ``os.path.getsize`` of the data file, and
        the directory size must equal the sum of the sizes of ``DATA`` and
        ``DATA_SUB``. Times must be exactly ``str``, sizes exactly ``int``.
        """
        fs = LocalFileSystem(url=self.root_dir)
        file_time, file_size = get_mtime_and_size(self.DATA, fs)
        dir_time, dir_size = get_mtime_and_size(self.DATA_DIR, fs)

        expected_file_size = os.path.getsize(self.DATA)
        expected_dir_size = sum(
            (os.path.getsize(self.DATA), os.path.getsize(self.DATA_SUB))
        )

        # assertIs on type() is deliberate: subclasses would not pass.
        self.assertIs(type(file_time), str)
        self.assertIs(type(file_size), int)
        self.assertEqual(file_size, expected_file_size)
        self.assertIs(type(dir_time), str)
        self.assertIs(type(dir_size), int)
        self.assertEqual(dir_size, expected_dir_size)
Example #21
0
def test_path_object_and_str_are_valid_types_get_mtime_and_size(tmp_dir):
    """``get_mtime_and_size`` must agree for ``str`` and ``PathInfo`` input.

    Checked for both a directory and a regular file.
    """
    layout = {"dir": {"dir_file": "dir file content"}, "file": "file_content"}
    tmp_dir.gen(layout)
    fs = LocalFileSystem(None, {"url": os.fspath(tmp_dir)}, use_dvcignore=True)

    for name in ("dir", "file"):
        str_time, str_size = get_mtime_and_size(name, fs)
        object_time, object_size = get_mtime_and_size(PathInfo(name), fs)
        assert str_time == object_time
        assert str_size == object_size
Example #22
0
def test_mtime_and_size(tmp_dir):
    """Check the types and sizes returned by ``get_mtime_and_size``.

    The file size must equal ``os.path.getsize`` of ``dir/data``, and the
    directory size must equal the sum of the sizes of both generated files.
    """
    tmp_dir.gen({"dir/data": "data", "dir/subdir/subdata": "subdata"})
    fs = LocalFileSystem(url=tmp_dir)
    file_time, file_size = get_mtime_and_size("dir/data", fs)
    dir_time, dir_size = get_mtime_and_size("dir", fs)

    expected_file_size = os.path.getsize("dir/data")
    expected_dir_size = sum(
        (os.path.getsize("dir/data"), os.path.getsize("dir/subdir/subdata"))
    )

    # File results.
    assert isinstance(file_time, str)
    assert isinstance(file_size, int)
    assert file_size == expected_file_size

    # Directory results.
    assert isinstance(dir_time, str)
    assert isinstance(dir_size, int)
    assert dir_size == expected_dir_size
Example #23
0
    def test(self):
        """Check ``get_mtime_and_size`` results when sizes come back as ``str``.

        Both the time and the size must be exactly ``str``. The sizes are
        compared against the stringified ``os.path.getsize`` values (for the
        directory, the sum of ``DATA`` and ``DATA_SUB``).
        """
        fs = LocalFileSystem(None, {"url": self.root_dir}, use_dvcignore=True)
        file_time, file_size = get_mtime_and_size(self.DATA, fs)
        dir_time, dir_size = get_mtime_and_size(self.DATA_DIR, fs)

        expected_file_size = os.path.getsize(self.DATA)
        expected_dir_size = sum(
            (os.path.getsize(self.DATA), os.path.getsize(self.DATA_SUB))
        )

        # assertIs on type() is deliberate: subclasses would not pass.
        self.assertIs(type(file_time), str)
        self.assertIs(type(file_size), str)
        self.assertEqual(file_size, str(expected_file_size))
        self.assertIs(type(dir_time), str)
        self.assertIs(type(dir_size), str)
        self.assertEqual(dir_size, str(expected_dir_size))
Example #24
0
def test_track(tmp_dir):
    """Check which selected keys end up tracked for a single params file.

    The assertions expect the list ``lst``, the leaf ``dct.foo`` and the
    leaf ``lst.0.foo0`` to be tracked, and the dictionaries ``dct`` and
    ``lst.0`` not to be.
    """
    params = {
        "lst": [
            {"foo0": "foo0", "bar0": "bar0"},
            {"foo1": "foo1", "bar1": "bar1"},
        ],
        "dct": {"foo": "foo", "bar": "bar", "baz": "baz"},
    }
    fs = LocalFileSystem()
    path = tmp_dir / "params.yaml"
    dump_yaml(path, params, fs)

    context = Context.load_from(fs, path)

    def key_tracked(tracked_map, key):
        # Only one source file is involved, so exactly one entry is expected.
        assert len(tracked_map) == 1
        return key in tracked_map[relpath(path)]

    with context.track() as tracked:
        context.select("lst")
        assert key_tracked(tracked, "lst")

        context.select("dct")
        assert not key_tracked(tracked, "dct")

        context.select("dct.foo")
        assert key_tracked(tracked, "dct.foo")

        # Currently, it's unable to track dictionaries, as it can be merged
        # from multiple sources.
        context.select("lst.0")
        assert not key_tracked(tracked, "lst.0")

        # FIXME: either support tracking list values in ParamsDependency
        # or, prevent this from being tracked.
        context.select("lst.0.foo0")
        assert key_tracked(tracked, "lst.0.foo0")
Example #25
0
def test_hashed_stream_reader_compatibility(tmp_dir, contents):
    """The reader must agree with ``file_md5`` and ``istextfile``.

    ``contents`` is written to a file, read once through
    ``HashedStreamReader``, and the reader's text-file flag and hash are
    compared with the values computed directly from the file.
    """
    # Always read more than the DEFAULT_CHUNK_SIZE (512 bytes).
    # This imitates the read actions performed by upload_fobj.
    read_size = DEFAULT_CHUNK_SIZE * 2

    tmp_dir.gen("data", contents)
    target = tmp_dir / "data"

    with open(target, "rb") as stream:
        reader = HashedStreamReader(stream)
        reader.read(read_size)

    fs = LocalFileSystem()
    expected_digest = file_md5(target, fs)

    # `is` compares the two results by identity, not just truthiness.
    assert reader.is_text_file is istextfile(target, fs)
    assert reader.hash_info.value == expected_digest
Example #26
0
    def __init__(
        self,
        tmp_dir: "StrPath",
        name: str,
        dir_suffix: Optional[str] = None,
    ):  # pylint: disable=super-init-not-called
        """Create the index directory for ``name`` and open a disk index.

        The directory is ``<tmp_dir>/<INDEX_DIR>/<name>`` and is created if
        missing. A falsy ``dir_suffix`` falls back to the filesystem's
        ``CHECKSUM_DIR_SUFFIX``.
        """
        from diskcache import Index

        from dvc.fs.local import LocalFileSystem
        from dvc.utils.fs import makedirs

        index_path = os.path.join(tmp_dir, self.INDEX_DIR, name)
        self.index_dir = index_path
        makedirs(self.index_dir, exist_ok=True)
        self.fs = LocalFileSystem()
        self.index = Index(self.index_dir)

        self.dir_suffix = dir_suffix or self.fs.CHECKSUM_DIR_SUFFIX
Example #27
0
def test_path_object_and_str_are_valid_types_get_mtime_and_size(tmp_dir):
    """Repeated ``get_mtime_and_size`` calls must agree for a dir and a file.

    NOTE(review): both calls in each pair pass the same ``str`` path, so no
    path-object variant is exercised despite the test name — confirm whether
    that is still intended.
    """
    layout = {"dir": {"dir_file": "dir file content"}, "file": "file_content"}
    tmp_dir.gen(layout)
    fs = LocalFileSystem(url=os.fspath(tmp_dir))

    for name in ("dir", "file"):
        first_time, first_size = get_mtime_and_size(name, fs)
        object_time, object_size = get_mtime_and_size(name, fs)
        assert first_time == object_time
        assert first_size == object_size
Example #28
0
def test_hashed_stream_reader(tmp_dir):
    """Check the reader's stream API and that its hash matches ``file_md5``.

    The reader must report itself readable but not seekable, and ``tell``
    must follow the number of bytes read so far.
    """
    tmp_dir.gen({"foo": "foo"})
    target = tmp_dir / "foo"

    with open(target, "rb") as stream:
        reader = HashedStreamReader(stream)

        assert reader.readable()
        assert not reader.seekable()

        # Read the three bytes in two steps, checking the position each time.
        assert reader.read(2) == b"fo"
        assert reader.tell() == 2

        assert reader.read(1) == b"o"
        assert reader.tell() == 3

    expected_digest = file_md5(target, LocalFileSystem())
    assert reader.is_text_file
    assert expected_digest == reader.hash_info.value
Example #29
0
    def __init__(
        self,
        tmp_dir: "StrPath",
        name: str,
    ):  # pylint: disable=super-init-not-called
        """Create the index directory for ``name`` and open a disk index.

        The directory is ``<tmp_dir>/<INDEX_DIR>/<name>`` and is created if
        missing. The index is built from a ``diskcache.Cache`` opened with
        pickle protocol 4 and eviction policy ``"none"``.
        """
        from diskcache import Cache, Index

        from dvc.fs.local import LocalFileSystem
        from dvc.utils.fs import makedirs

        index_path = os.path.join(tmp_dir, self.INDEX_DIR, name)
        self.index_dir = index_path
        makedirs(self.index_dir, exist_ok=True)
        self.fs = LocalFileSystem()

        backing_cache = Cache(
            self.index_dir,
            disk_pickle_protocol=4,
            eviction_policy="none",
        )
        self.index = Index.fromcache(backing_cache)
Example #30
0
def test_status_download_optimization(mocker, dvc):
    """When comparing the status to pull a remote cache,
    And the desired files to fetch are already on the local cache,
    Don't check the existence of the desired files on the remote cache
    """
    from dvc.objects.status import compare_status

    local_odb = LocalObjectDB(LocalFileSystem(), PathInfo("."))
    wanted = {
        HashInfo("md5", "acbd18db4cc2f85cedef654fccc4a4d8"),
        HashInfo("md5", "37b51d194a7513e45b56f6524f2d51f2"),
    }

    # Pretend every wanted hash is already present in the local ODB.
    present_locally = [item.value for item in wanted]
    mocker.patch.object(
        local_odb, "hashes_exist", return_value=present_locally
    )

    source_odb = mocker.Mock()

    compare_status(source_odb, local_odb, wanted, check_deleted=False)
    # The source ODB must never have been asked which hashes exist.
    assert source_odb.hashes_exist.call_count == 0