Exemple #1
0
def test_get_cloud_fs_validate(tmp_dir, dvc):
    """Check resolved configs of chained remotes and that a bad one raises.

    ``first`` and ``second`` point at their parent remote through
    ``remote://`` URLs. ``second`` additionally carries an ``oss_key_id``
    option, and looking it up is expected to raise ``ConfigError``.
    """
    first_config = {"url": "remote://base/first", "type": "symlink"}
    second_config = {"url": "remote://first/second", "oss_key_id": "mykey"}

    tmp_dir.add_remote(
        name="base", url="ssh://example.com/path", default=False
    )
    tmp_dir.add_remote(name="first", config=first_config, default=False)
    tmp_dir.add_remote(name="second", config=second_config, default=False)

    base_fs = get_cloud_fs(dvc, name="base")
    assert base_fs.config == {"url": "ssh://example.com/path"}

    first_fs = get_cloud_fs(dvc, name="first")
    assert first_fs.config == {
        "url": "ssh://example.com/path/first",
        "type": ["symlink"],
    }

    # Only the lookup of the last remote is expected to fail.
    with pytest.raises(ConfigError):
        get_cloud_fs(dvc, name="second")
Exemple #2
0
def test_get_cloud_fs(tmp_dir, dvc):
    """Nested ``remote://`` URLs resolve to paths below the base remote."""
    for remote_name, remote_url in (
        ("base", "s3://bucket/path"),
        ("first", "remote://base/first"),
        ("second", "remote://first/second"),
    ):
        tmp_dir.add_remote(name=remote_name, url=remote_url, default=False)

    base_info = CloudURLInfo("s3://bucket/path")
    first_info = base_info / "first"
    expected = {
        "base": base_info,
        "first": first_info,
        "second": first_info / "second",
    }

    for remote_name, path_info in expected.items():
        assert get_cloud_fs(dvc, name=remote_name).path_info == path_info
Exemple #3
0
    def _init_odb(self, name):
        """Build the object database for the remote called ``name``.

        The filesystem config returned by ``get_cloud_fs`` is extended with
        ``tmp_dir`` (the repo's ``index_db_dir``) and is used both to
        instantiate the filesystem and as keyword arguments for ``get_odb``.
        """
        from dvc.data.db import get_odb
        from dvc.fs import get_cloud_fs

        fs_cls, fs_config, fs_path = get_cloud_fs(self.repo, name=name)
        fs_config["tmp_dir"] = self.repo.index_db_dir
        filesystem = fs_cls(**fs_config)
        return get_odb(filesystem, fs_path, **fs_config)
Exemple #4
0
def test_remote_with_jobs(dvc, base_url, fs_cls):
    """Check which user a WebDAV remote is resolved with.

    Without an explicit ``user`` option the resolved config must carry
    ``user`` (a name defined outside this function; presumably the user
    embedded in ``base_url`` -- confirm against the parametrization). Once
    the remote config sets ``user`` explicitly, that value must win, while
    the host keeps the user taken from the URL.

    Args:
        dvc: repo fixture whose config receives the ``dav`` remote.
        base_url: URL stored as the remote's ``url`` option.
        fs_cls: filesystem class the remote is expected to resolve to.
    """
    scheme = "https" if fs_cls is WebDAVSFileSystem else "http"
    remote_config = {"url": base_url}

    dvc.config["remote"]["dav"] = remote_config
    cls, config, _ = get_cloud_fs(dvc, name="dav")
    assert config["user"] == user
    assert f"{scheme}://{user}@example.com" in config["host"]
    assert cls is fs_cls

    # config from remote takes priority: the user set here has to be the
    # same value that is asserted right below.
    remote_config.update({"user": "admin"})
    cls, config, _ = get_cloud_fs(dvc, name="dav")
    assert config["user"] == "admin"
    assert f"{scheme}://{user}@example.com" in config["host"]
    assert cls is fs_cls
Exemple #5
0
    def _init_odb(self, name):
        """Build the object database for the remote called ``name``.

        ``tmp_dir`` (the repo's ``tmp_dir``) is added to the config returned
        by ``get_cloud_fs`` before it is used to instantiate the filesystem
        and forwarded to ``get_odb``.
        """
        from dvc.fs import get_cloud_fs
        from dvc.objects.db import get_odb

        fs_cls, fs_config, remote_path = get_cloud_fs(self.repo, name=name)
        fs_config["tmp_dir"] = self.repo.tmp_dir
        filesystem = fs_cls(**fs_config)
        return get_odb(filesystem, remote_path, **fs_config)
Exemple #6
0
def test_remote_without_hash_jobs(dvc):
    """A remote without ``checksum_jobs`` ends up with the core value."""
    remote_name = "without_hash_jobs"
    dvc.config["remote"][remote_name] = {"url": "s3://bucket/name"}
    dvc.config["core"]["checksum_jobs"] = 200

    fs_cls, fs_config, _ = get_cloud_fs(dvc, name=remote_name)
    filesystem = fs_cls(**fs_config)
    assert filesystem.hash_jobs == 200
Exemple #7
0
def test_fs_ls(dvc, cloud):
    """List a generated directory tree through the cloud filesystem.

    Covers a plain listing of the top directory, a listing of an empty
    directory, and a ``detail=True`` listing that reports entry types.
    """
    cloud.gen({
        "directory": {
            "foo": "foo",
            "bar": "bar",
            "baz": {
                "quux": "quux",
                "egg": {
                    "foo": "foo"
                }
            },
            "empty": {},
        }
    })
    cls, config, path = get_cloud_fs(dvc, **cloud.config)
    fs = cls(**config)
    # Join with the filesystem's own path helper, as the listings below do;
    # os.path.join would use the host OS separator for a remote path.
    path = fs.path.join(path, "directory")

    # Entries may come back with a trailing slash, hence the rstrip.
    assert {
        os.path.basename(file_key.rstrip("/"))
        for file_key in fs.ls(path)
    } == {
        "foo",
        "bar",
        "baz",
        "empty",
    }
    assert set(fs.ls(fs.path.join(path, "empty"))) == set()
    assert {(detail["type"], os.path.basename(detail["name"].rstrip("/")))
            for detail in fs.ls(fs.path.join(path, "baz"), detail=True)
            } == {("file", "quux"), ("directory", "egg")}
Exemple #8
0
def test_fs_getsize(dvc, cloud):
    """``getsize`` reports the length of the generated file contents."""
    cloud.gen({"data": {"foo": "foo"}, "baz": "baz baz"})
    fs = get_cloud_fs(dvc, **cloud.config)
    root = fs.path_info

    baz_size = fs.getsize(root / "baz")
    assert baz_size == 7

    nested_size = fs.getsize(root / "data" / "foo")
    assert nested_size == 3
Exemple #9
0
def test_fs_ls(dvc, cloud):
    """List a generated directory tree through the cloud filesystem.

    Checks the names in the top directory, that an empty directory lists
    nothing, and the ``(type, name)`` pairs of a ``detail=True`` listing.
    """
    tree = {
        "directory": {
            "foo": "foo",
            "bar": "bar",
            "baz": {"quux": "quux", "egg": {"foo": "foo"}},
            "empty": {},
        }
    }
    cloud.gen(tree)
    fs = get_cloud_fs(dvc, **cloud.config)
    directory = cloud / "directory"

    top_level = {os.path.basename(entry) for entry in fs.ls(directory)}
    assert top_level == {"foo", "bar", "baz", "empty"}

    assert set(fs.ls(directory / "empty")) == set()

    baz_entries = {
        (entry["type"], os.path.basename(entry["name"]))
        for entry in fs.ls(directory / "baz", detail=True)
    }
    assert baz_entries == {("file", "quux"), ("directory", "egg")}
Exemple #10
0
def test_fs_getsize(dvc, cloud):
    """``getsize`` reports the length of the generated file contents."""
    cloud.gen({"data": {"foo": "foo"}, "baz": "baz baz"})
    fs_cls, fs_config, root = get_cloud_fs(dvc, **cloud.config)
    fs = fs_cls(**fs_config)

    baz_path = fs.path.join(root, "baz")
    assert fs.getsize(baz_path) == 7

    nested_path = fs.path.join(root, "data", "foo")
    assert fs.getsize(nested_path) == 3
Exemple #11
0
def _process_stages(repo, sub_targets, stages, no_commit, pbar, to_remote,
                    to_cache, **kwargs):
    """Save, optionally commit, and dump every stage in ``stages``.

    When ``to_remote`` or ``to_cache`` is set, the single stage/target/output
    is transferred instead (through ``repo.cloud.transfer`` for
    ``to_remote``, otherwise into ``out.odb`` followed by a checkout) and the
    stage file is dumped right away.

    Returns:
        list: stages whose commit raised ``CacheLinkError``; always empty on
        the transfer path.
    """
    failed_links = []
    from dvc.dvcfile import Dvcfile

    from ..output.base import OutputDoesNotExistError

    if to_remote or to_cache:
        # Already verified in the add()
        (stage, ) = stages
        (target, ) = sub_targets
        (out, ) = stage.outs
        jobs = kwargs.get("jobs")

        if not to_remote:
            from dvc.fs import get_cloud_fs
            from dvc.objects import transfer

            source_fs = get_cloud_fs(repo, url=target)
            out.hash_info = transfer(
                out.odb, source_fs, source_fs.path_info, jobs=jobs
            )
            out.checkout()
        else:
            out.hash_info = repo.cloud.transfer(
                target,
                jobs=jobs,
                remote=kwargs.get("remote"),
                command="add",
            )

        Dvcfile(repo, stage.path).dump(stage)
        return failed_links

    stage_count = len(stages)
    progress = Tqdm(
        total=stage_count,
        desc="Processing",
        unit="file",
        disable=stage_count == 1,
    )
    with progress as pbar_stages:
        for stage in stages:
            try:
                stage.save()
            except OutputDoesNotExistError:
                # Take the failed stage back out of the caller's progress
                # count before propagating the error.
                pbar.n -= 1
                raise

            if not no_commit:
                try:
                    stage.commit()
                except CacheLinkError:
                    # Collected and returned instead of aborting the loop.
                    failed_links.append(stage)

            Dvcfile(repo, stage.path).dump(stage)
            pbar_stages.update()

    return failed_links
Exemple #12
0
def test_fs_find_recursive(dvc, cloud):
    """``find`` yields the files of nested directories as well."""
    cloud.gen({"data": {"foo": "foo", "bar": {"baz": "baz"}, "quux": "quux"}})
    fs = get_cloud_fs(dvc, **cloud.config)
    root = fs.path_info

    found_names = set()
    for file_key in fs.find(root / "data"):
        found_names.add(os.path.basename(file_key))

    assert found_names == {"foo", "baz", "quux"}
Exemple #13
0
def _get_odb(repo, settings):
    """Return the object database for ``settings``, or ``None`` if empty.

    ``settings`` is expanded into keyword arguments for ``get_cloud_fs``.
    """
    from dvc.fs import get_cloud_fs

    if settings:
        cloud_fs = get_cloud_fs(repo, **settings)
        return get_odb(cloud_fs)
    return None
Exemple #14
0
def test_get_cloud_fs(tmp_dir, dvc):
    """Nested ``remote://`` URLs resolve to paths below the base remote."""
    remote_urls = {
        "base": "s3://bucket/path",
        "first": "remote://base/first",
        "second": "remote://first/second",
    }
    for remote_name, remote_url in remote_urls.items():
        tmp_dir.add_remote(name=remote_name, url=remote_url, default=False)

    expected_paths = {
        "base": "bucket/path",
        "first": "bucket/path/first",
        "second": "bucket/path/first/second",
    }
    for remote_name, expected in expected_paths.items():
        # Only the resolved path (third element) matters here.
        _, _, resolved = get_cloud_fs(dvc, name=remote_name)
        assert resolved == expected
Exemple #15
0
def _get_odb(repo, settings):
    """Return the object database for ``settings``, or ``None`` if empty.

    The filesystem is instantiated from the config returned by
    ``get_cloud_fs``; the same config plus ``repo.state`` is forwarded to
    ``get_odb``.
    """
    from dvc.fs import get_cloud_fs

    if settings:
        fs_cls, fs_config, root = get_cloud_fs(repo, **settings)
        filesystem = fs_cls(**fs_config)
        return get_odb(filesystem, root, state=repo.state, **fs_config)
    return None
Exemple #16
0
def test_fs_ls_with_etag(dvc, cloud):
    """Etags from a recursive detailed ``ls`` match those from ``info``."""
    cloud.gen({"data": {"foo": "foo", "bar": {"baz": "baz"}, "quux": "quux"}})
    fs = get_cloud_fs(dvc, **cloud.config)
    root = fs.path_info

    for entry in fs.ls(root / "data", recursive=True, detail=True):
        # Look the same entry up individually and compare both etags.
        entry_info = root.replace(path=entry["name"])
        assert fs.info(entry_info)["etag"] == entry["etag"]
Exemple #17
0
def test_fs_find_with_etag(dvc, cloud):
    """Etags from a detailed ``find`` match those reported by ``info``."""
    cloud.gen({"data": {"foo": "foo", "bar": {"baz": "baz"}, "quux": "quux"}})
    fs_cls, fs_config, root = get_cloud_fs(dvc, **cloud.config)
    fs = fs_cls(**fs_config)

    for entry in fs.find(root / "data", detail=True):
        # Look the same entry up individually and compare both etags.
        entry_info = root.replace(path=entry["name"])
        assert fs.info(entry_info)["etag"] == entry["etag"]
Exemple #18
0
def test_remote_with_hash_jobs(dvc):
    """A remote's own ``checksum_jobs`` wins over the core setting."""
    remote_name = "with_hash_jobs"
    remote_settings = {"url": "s3://bucket/name", "checksum_jobs": 100}

    dvc.config["remote"][remote_name] = remote_settings
    dvc.config["core"]["checksum_jobs"] = 200

    remote_fs = get_cloud_fs(dvc, name=remote_name)
    assert remote_fs.hash_jobs == 100
Exemple #19
0
def _get_odb(repo, settings):
    """Return the object database for ``settings``, or ``None`` if empty.

    ``tmp_dir`` (the repo's ``tmp_dir``) is added to the config returned by
    ``get_cloud_fs`` before the filesystem is instantiated; the config and
    ``repo.state`` are forwarded to ``get_odb``.
    """
    from dvc.fs import get_cloud_fs

    if settings:
        fs_cls, fs_config, root = get_cloud_fs(repo, **settings)
        fs_config["tmp_dir"] = repo.tmp_dir
        filesystem = fs_cls(**fs_config)
        return get_odb(filesystem, root, state=repo.state, **fs_config)
    return None
Exemple #20
0
def test_fs_find(dvc, cloud):
    """``find`` yields the files of nested directories as well."""
    cloud.gen({"data": {"foo": "foo", "bar": {"baz": "baz"}, "quux": "quux"}})
    fs_cls, fs_config, root = get_cloud_fs(dvc, **cloud.config)
    fs = fs_cls(**fs_config)

    found_names = set()
    for file_key in fs.find(root / "data"):
        found_names.add(os.path.basename(file_key))

    assert found_names == {"foo", "baz", "quux"}
Exemple #21
0
def test_remote_with_jobs(dvc):
    """A remote's own ``jobs`` option wins over the core setting."""
    remote_name = "with_jobs"
    remote_settings = {"url": "s3://bucket/name", "jobs": 100}

    dvc.config["remote"][remote_name] = remote_settings
    dvc.config["core"]["jobs"] = 200

    fs_cls, fs_config, _ = get_cloud_fs(dvc, name=remote_name)
    filesystem = fs_cls(**fs_config)
    assert filesystem.jobs == 100
Exemple #22
0
def test_upload_callback(tmp_dir, dvc, cloud):
    """Uploading a file reports its full size through the callback."""
    tmp_dir.gen("foo", "foo")
    fs_cls, fs_config, _ = get_cloud_fs(dvc, **cloud.config)
    fs = fs_cls(**fs_config)
    local_size = os.path.getsize(tmp_dir / "foo")

    progress = fsspec.Callback()
    fs.upload(tmp_dir / "foo", cloud / "foo", callback=progress)

    # Both the announced total and the final position must equal the size.
    assert progress.size == local_size
    assert progress.value == local_size
Exemple #23
0
def test_download_dir_callback(tmp_dir, dvc, cloud):
    """Downloading a directory counts its two files in the callback."""
    fs_cls, fs_config, _ = get_cloud_fs(dvc, **cloud.config)
    fs = fs_cls(**fs_config)
    dir_contents = {"foo": "foo", "bar": "bar"}
    cloud.gen({"dir": dir_contents})

    progress = fsspec.Callback()
    fs.download(cloud / "dir", tmp_dir / "dir", callback=progress)

    assert progress.size == 2
    assert progress.value == 2
    assert (tmp_dir / "dir").read_text() == {"foo": "foo", "bar": "bar"}
Exemple #24
0
def test_download_callback(tmp_dir, dvc, cloud):
    """Downloading a file reports its full size through the callback."""
    fs_cls, fs_config, _ = get_cloud_fs(dvc, **cloud.config)
    fs = fs_cls(**fs_config)

    remote_file = cloud / "foo"
    local_file = tmp_dir / "foo"

    fs.upload(io.BytesIO(b"foo"), remote_file)
    remote_size = fs.getsize(remote_file)

    progress = fsspec.Callback()
    fs.download_file(remote_file, local_file, callback=progress)

    assert progress.size == remote_size
    assert progress.value == remote_size
    assert local_file.read_text() == "foo"
Exemple #25
0
def test_fs_upload_fobj(dvc, tmp_dir, cloud):
    """``upload_fobj`` stores the content of an open local file."""
    tmp_dir.gen("foo", "foo")
    fs = get_cloud_fs(dvc, **cloud.config)

    source = tmp_dir / "foo"
    destination = fs.path_info / "foo"

    with open(source, "rb") as local_fobj:
        fs.upload_fobj(local_fobj, destination)

    assert fs.exists(destination)
    with fs.open(destination, "rb") as remote_fobj:
        uploaded = remote_fobj.read()
        assert uploaded == b"foo"
Exemple #26
0
def test_fs_makedirs_on_upload_and_copy(dvc, cloud):
    """Upload and copy both leave the destination's parent directory behind.

    Neither ``dir`` nor ``dir2`` is created explicitly by this test.
    """
    fs_cls, fs_config, _ = get_cloud_fs(dvc, **cloud.config)
    fs = fs_cls(**fs_config)

    upload_dir = cloud / "dir"
    copy_dir = cloud / "dir2"

    with io.BytesIO(b"foo") as payload:
        fs.upload_fobj(payload, upload_dir / "foo")

    assert fs.isdir(upload_dir)
    assert fs.exists(upload_dir / "foo")

    fs.copy(upload_dir / "foo", copy_dir / "foo")
    assert fs.isdir(copy_dir)
    assert fs.exists(copy_dir / "foo")
Exemple #27
0
def test_fs_fsspec_path_management(dvc, cloud):
    """``info`` reports the expected name and type for root and sub-dirs."""
    cloud.gen({"foo": "foo", "data": {"bar": "bar", "baz": {"foo": "foo"}}})
    fs = get_cloud_fs(dvc, **cloud.config)

    # The last entry of ``parents`` is used as the root of the cloud path.
    ancestors = cloud.parents
    root = ancestors[len(ancestors) - 1]
    root_info = fs.info(root)

    # special conditions: name always points to the bucket name
    assert root_info["name"] == root.bucket
    assert root_info["type"] == "directory"

    data_dir = cloud / "data"
    data_info = fs.info(data_dir)
    # The reported name may carry a trailing slash.
    assert data_info["name"].rstrip("/") == data_dir.path
    assert data_info["type"] == "directory"
Exemple #28
0
    def transfer(self, source, jobs=None, remote=None, command=None):
        """Transfer the data at ``source`` to a remote, cloud-agnostically.

        Args:
            source (str): url of the location to transfer from.
            jobs (int): number of jobs that may run simultaneously.
            remote: optional remote selector, forwarded to
                ``self.get_remote``. By default the remote from the
                core.remote config option is used.
            command (str): the command on whose behalf the transfer runs
                (used for reporting better error messages).

        Returns:
            Whatever the selected remote's ``transfer`` returns.
        """
        from dvc.fs import get_cloud_fs

        source_fs = get_cloud_fs(self.repo, url=source)
        target_remote = self.get_remote(remote, command)
        return target_remote.transfer(
            source_fs, source_fs.path_info, jobs=jobs
        )
Exemple #29
0
def test_download_callback(tmp_dir, dvc, cloud, local_cloud):
    """Downloading a file reports its full size through the callback."""
    fs_cls, fs_config, _ = get_cloud_fs(dvc, **cloud.config)
    fs = fs_cls(**fs_config)

    staged = tmp_dir / "to_upload"
    staged.write_text("foo")
    fs.upload(staged.fs_path, (cloud / "foo").fs_path)
    remote_size = fs.getsize((cloud / "foo").fs_path)

    progress = fsspec.Callback()
    fs.download_file(
        (cloud / "foo").fs_path,
        (tmp_dir / "foo").fs_path,
        callback=progress,
    )

    assert progress.size == remote_size
    assert progress.value == remote_size
    assert (tmp_dir / "foo").read_text() == "foo"
Exemple #30
0
def test_hardlink_optimization(tmp_dir, dvc, ssh):
    """``hardlink`` of an empty file over SSH must not yield a hardlink.

    NOTE(review): presumably the filesystem special-cases empty files and
    creates a fresh file instead of linking -- confirm against
    ``SSHFileSystem.hardlink``.
    """
    from dvc.fs import get_cloud_fs

    fs_cls, fs_config, root = get_cloud_fs(dvc, **ssh.config)
    fs = fs_cls(**fs_config)
    assert isinstance(fs, SSHFileSystem)

    empty_file = root / "empty"
    link = root / "link"

    # Opening for writing and closing right away leaves an empty file.
    with fs.open(empty_file, "wb"):
        pass

    # On Windows the remote path is checked as a local path on drive c:.
    link_path = (
        "c:" + link.path.replace("/", "\\")
        if os.name == "nt"
        else link.path
    )

    fs.hardlink(empty_file, link)
    assert not System.is_hardlink(link_path)