def test_get_cloud_fs_validate(tmp_dir, dvc):
    tmp_dir.add_remote(
        name="base", url="ssh://example.com/path", default=False
    )
    tmp_dir.add_remote(
        name="first",
        config={"url": "remote://base/first", "type": "symlink"},
        default=False,
    )
    tmp_dir.add_remote(
        name="second",
        config={"url": "remote://first/second", "oss_key_id": "mykey"},
        default=False,
    )

    assert get_cloud_fs(dvc, name="base").config == {
        "url": "ssh://example.com/path"
    }
    assert get_cloud_fs(dvc, name="first").config == {
        "url": "ssh://example.com/path/first",
        "type": ["symlink"],
    }

    # "oss_key_id" is not a valid option for an ssh-based remote chain,
    # so resolving the chained remote must fail schema validation.
    with pytest.raises(ConfigError):
        get_cloud_fs(dvc, name="second")
def test_get_cloud_fs(tmp_dir, dvc):
    tmp_dir.add_remote(name="base", url="s3://bucket/path", default=False)
    tmp_dir.add_remote(name="first", url="remote://base/first", default=False)
    tmp_dir.add_remote(
        name="second", url="remote://first/second", default=False
    )

    base = CloudURLInfo("s3://bucket/path")
    first = base / "first"
    second = first / "second"

    assert get_cloud_fs(dvc, name="base").path_info == base
    assert get_cloud_fs(dvc, name="first").path_info == first
    assert get_cloud_fs(dvc, name="second").path_info == second
def _init_odb(self, name):
    from dvc.data.db import get_odb
    from dvc.fs import get_cloud_fs

    cls, config, fs_path = get_cloud_fs(self.repo, name=name)
    config["tmp_dir"] = self.repo.index_db_dir
    return get_odb(cls(**config), fs_path, **config)
def test_remote_with_jobs(dvc, base_url, fs_cls):
    # In the original test module, `user` is a module-level constant and
    # `base_url`/`fs_cls` come from pytest parametrization; `user` is
    # presumably embedded in the parametrized url.
    scheme = "http" + ("s" if fs_cls is WebDAVSFileSystem else "")
    remote_config = {"url": base_url}

    dvc.config["remote"]["dav"] = remote_config
    cls, config, _ = get_cloud_fs(dvc, name="dav")
    assert config["user"] == user
    assert f"{scheme}://{user}@example.com" in config["host"]
    assert cls is fs_cls

    # config from remote takes priority
    remote_config.update({"user": "admin"})  # must match the assertion below
    cls, config, _ = get_cloud_fs(dvc, name="dav")
    assert config["user"] == "admin"
    assert f"{scheme}://{user}@example.com" in config["host"]
    assert cls is fs_cls
def _init_odb(self, name):
    from dvc.fs import get_cloud_fs
    from dvc.objects.db import get_odb

    cls, config, path_info = get_cloud_fs(self.repo, name=name)
    config["tmp_dir"] = self.repo.tmp_dir
    return get_odb(cls(**config), path_info, **config)
def test_remote_without_hash_jobs(dvc):
    dvc.config["remote"]["without_hash_jobs"] = {"url": "s3://bucket/name"}
    dvc.config["core"]["checksum_jobs"] = 200

    cls, config, _ = get_cloud_fs(dvc, name="without_hash_jobs")
    fs = cls(**config)
    # no per-remote override, so the fs falls back to core.checksum_jobs
    assert fs.hash_jobs == 200
def test_fs_ls(dvc, cloud):
    cloud.gen(
        {
            "directory": {
                "foo": "foo",
                "bar": "bar",
                "baz": {"quux": "quux", "egg": {"foo": "foo"}},
                "empty": {},
            }
        }
    )

    cls, config, path = get_cloud_fs(dvc, **cloud.config)
    fs = cls(**config)
    path = os.path.join(path, "directory")

    assert {
        os.path.basename(file_key.rstrip("/")) for file_key in fs.ls(path)
    } == {
        "foo",
        "bar",
        "baz",
        "empty",
    }
    assert set(fs.ls(fs.path.join(path, "empty"))) == set()
    assert {
        (detail["type"], os.path.basename(detail["name"].rstrip("/")))
        for detail in fs.ls(fs.path.join(path, "baz"), detail=True)
    } == {("file", "quux"), ("directory", "egg")}
def test_fs_getsize(dvc, cloud):
    cloud.gen({"data": {"foo": "foo"}, "baz": "baz baz"})
    fs = get_cloud_fs(dvc, **cloud.config)
    path_info = fs.path_info

    assert fs.getsize(path_info / "baz") == 7
    assert fs.getsize(path_info / "data" / "foo") == 3
def test_fs_ls(dvc, cloud):
    cloud.gen(
        {
            "directory": {
                "foo": "foo",
                "bar": "bar",
                "baz": {"quux": "quux", "egg": {"foo": "foo"}},
                "empty": {},
            }
        }
    )

    fs = get_cloud_fs(dvc, **cloud.config)
    path_info = cloud / "directory"

    assert {os.path.basename(file_key) for file_key in fs.ls(path_info)} == {
        "foo",
        "bar",
        "baz",
        "empty",
    }
    assert set(fs.ls(path_info / "empty")) == set()
    assert {
        (detail["type"], os.path.basename(detail["name"]))
        for detail in fs.ls(path_info / "baz", detail=True)
    } == {("file", "quux"), ("directory", "egg")}
def test_fs_getsize(dvc, cloud):
    cloud.gen({"data": {"foo": "foo"}, "baz": "baz baz"})

    cls, config, path = get_cloud_fs(dvc, **cloud.config)
    fs = cls(**config)

    assert fs.getsize(fs.path.join(path, "baz")) == 7
    assert fs.getsize(fs.path.join(path, "data", "foo")) == 3
def _process_stages(
    repo, sub_targets, stages, no_commit, pbar, to_remote, to_cache, **kwargs
):
    link_failures = []
    from dvc.dvcfile import Dvcfile
    from dvc.exceptions import CacheLinkError
    from dvc.progress import Tqdm

    from ..output.base import OutputDoesNotExistError

    if to_remote or to_cache:
        # Already verified in the add()
        (stage,) = stages
        (target,) = sub_targets
        (out,) = stage.outs

        if to_remote:
            # upload straight to the remote without touching the local cache
            out.hash_info = repo.cloud.transfer(
                target,
                jobs=kwargs.get("jobs"),
                remote=kwargs.get("remote"),
                command="add",
            )
        else:
            # transfer from the external location into the local cache
            from dvc.fs import get_cloud_fs
            from dvc.objects import transfer

            from_fs = get_cloud_fs(repo, url=target)
            out.hash_info = transfer(
                out.odb,
                from_fs,
                from_fs.path_info,
                jobs=kwargs.get("jobs"),
            )
            out.checkout()

        Dvcfile(repo, stage.path).dump(stage)
        return link_failures

    with Tqdm(
        total=len(stages),
        desc="Processing",
        unit="file",
        disable=len(stages) == 1,
    ) as pbar_stages:
        for stage in stages:
            try:
                stage.save()
            except OutputDoesNotExistError:
                pbar.n -= 1
                raise

            try:
                if not no_commit:
                    stage.commit()
            except CacheLinkError:
                link_failures.append(stage)

            Dvcfile(repo, stage.path).dump(stage)
            pbar_stages.update()

    return link_failures
def test_fs_find_recursive(dvc, cloud):
    cloud.gen({"data": {"foo": "foo", "bar": {"baz": "baz"}, "quux": "quux"}})
    fs = get_cloud_fs(dvc, **cloud.config)
    path_info = fs.path_info

    assert {
        os.path.basename(file_key) for file_key in fs.find(path_info / "data")
    } == {"foo", "baz", "quux"}
def _get_odb(repo, settings):
    from dvc.fs import get_cloud_fs

    if not settings:
        return None

    fs = get_cloud_fs(repo, **settings)
    return get_odb(fs)
def test_get_cloud_fs(tmp_dir, dvc):
    tmp_dir.add_remote(name="base", url="s3://bucket/path", default=False)
    tmp_dir.add_remote(name="first", url="remote://base/first", default=False)
    tmp_dir.add_remote(
        name="second", url="remote://first/second", default=False
    )

    base = "bucket/path"
    first = f"{base}/first"
    second = f"{first}/second"

    _, _, path = get_cloud_fs(dvc, name="base")
    assert path == base

    _, _, path = get_cloud_fs(dvc, name="first")
    assert path == first

    _, _, path = get_cloud_fs(dvc, name="second")
    assert path == second
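# Illustrative sketch (not part of the original suite): the newer
# get_cloud_fs API exercised in the test above returns a (cls, config, path)
# triple rather than a ready-made filesystem object, so callers instantiate
# the filesystem themselves. Assumes the "base" remote configured above;
# the helper name is hypothetical.
def _example_instantiate_fs(dvc):
    cls, config, path = get_cloud_fs(dvc, name="base")
    fs = cls(**config)  # build the filesystem from its resolved config
    return fs.exists(path)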
def _get_odb(repo, settings):
    from dvc.fs import get_cloud_fs

    if not settings:
        return None

    cls, config, path_info = get_cloud_fs(repo, **settings)
    return get_odb(cls(**config), path_info, state=repo.state, **config)
def test_fs_ls_with_etag(dvc, cloud):
    cloud.gen({"data": {"foo": "foo", "bar": {"baz": "baz"}, "quux": "quux"}})
    fs = get_cloud_fs(dvc, **cloud.config)
    path_info = fs.path_info

    for details in fs.ls(path_info / "data", recursive=True, detail=True):
        assert (
            fs.info(path_info.replace(path=details["name"]))["etag"]
            == details["etag"]
        )
def test_fs_find_with_etag(dvc, cloud):
    cloud.gen({"data": {"foo": "foo", "bar": {"baz": "baz"}, "quux": "quux"}})

    cls, config, path_info = get_cloud_fs(dvc, **cloud.config)
    fs = cls(**config)

    for details in fs.find(path_info / "data", detail=True):
        assert (
            fs.info(path_info.replace(path=details["name"]))["etag"]
            == details["etag"]
        )
def test_remote_with_hash_jobs(dvc):
    dvc.config["remote"]["with_hash_jobs"] = {
        "url": "s3://bucket/name",
        "checksum_jobs": 100,
    }
    dvc.config["core"]["checksum_jobs"] = 200

    fs = get_cloud_fs(dvc, name="with_hash_jobs")
    # the per-remote checksum_jobs setting overrides core.checksum_jobs
    assert fs.hash_jobs == 100
def _get_odb(repo, settings):
    from dvc.fs import get_cloud_fs

    if not settings:
        return None

    cls, config, fs_path = get_cloud_fs(repo, **settings)
    config["tmp_dir"] = repo.tmp_dir
    return get_odb(cls(**config), fs_path, state=repo.state, **config)
def test_fs_find(dvc, cloud):
    cloud.gen({"data": {"foo": "foo", "bar": {"baz": "baz"}, "quux": "quux"}})

    cls, config, path_info = get_cloud_fs(dvc, **cloud.config)
    fs = cls(**config)

    assert {
        os.path.basename(file_key) for file_key in fs.find(path_info / "data")
    } == {"foo", "baz", "quux"}
def test_remote_with_jobs(dvc):
    dvc.config["remote"]["with_jobs"] = {
        "url": "s3://bucket/name",
        "jobs": 100,
    }
    dvc.config["core"]["jobs"] = 200

    cls, config, _ = get_cloud_fs(dvc, name="with_jobs")
    fs = cls(**config)
    assert fs.jobs == 100
def test_upload_callback(tmp_dir, dvc, cloud):
    tmp_dir.gen("foo", "foo")
    cls, config, _ = get_cloud_fs(dvc, **cloud.config)
    fs = cls(**config)
    expected_size = os.path.getsize(tmp_dir / "foo")

    callback = fsspec.Callback()
    fs.upload(tmp_dir / "foo", cloud / "foo", callback=callback)

    assert callback.size == expected_size
    assert callback.value == expected_size
def test_download_dir_callback(tmp_dir, dvc, cloud):
    cls, config, _ = get_cloud_fs(dvc, **cloud.config)
    fs = cls(**config)

    cloud.gen({"dir": {"foo": "foo", "bar": "bar"}})

    callback = fsspec.Callback()
    fs.download(cloud / "dir", tmp_dir / "dir", callback=callback)

    assert callback.size == 2
    assert callback.value == 2
    assert (tmp_dir / "dir").read_text() == {"foo": "foo", "bar": "bar"}
def test_download_callback(tmp_dir, dvc, cloud):
    cls, config, _ = get_cloud_fs(dvc, **cloud.config)
    fs = cls(**config)
    fs.upload(io.BytesIO(b"foo"), cloud / "foo")
    expected_size = fs.getsize(cloud / "foo")

    callback = fsspec.Callback()
    fs.download_file(cloud / "foo", tmp_dir / "foo", callback=callback)

    assert callback.size == expected_size
    assert callback.value == expected_size
    assert (tmp_dir / "foo").read_text() == "foo"
def test_fs_upload_fobj(dvc, tmp_dir, cloud):
    tmp_dir.gen("foo", "foo")
    fs = get_cloud_fs(dvc, **cloud.config)

    from_info = tmp_dir / "foo"
    to_info = fs.path_info / "foo"

    with open(from_info, "rb") as stream:
        fs.upload_fobj(stream, to_info)

    assert fs.exists(to_info)
    with fs.open(to_info, "rb") as stream:
        assert stream.read() == b"foo"
def test_fs_makedirs_on_upload_and_copy(dvc, cloud):
    cls, config, _ = get_cloud_fs(dvc, **cloud.config)
    fs = cls(**config)

    with io.BytesIO(b"foo") as stream:
        fs.upload_fobj(stream, cloud / "dir" / "foo")

    assert fs.isdir(cloud / "dir")
    assert fs.exists(cloud / "dir" / "foo")

    fs.copy(cloud / "dir" / "foo", cloud / "dir2" / "foo")
    assert fs.isdir(cloud / "dir2")
    assert fs.exists(cloud / "dir2" / "foo")
def test_fs_fsspec_path_management(dvc, cloud):
    cloud.gen({"foo": "foo", "data": {"bar": "bar", "baz": {"foo": "foo"}}})
    fs = get_cloud_fs(dvc, **cloud.config)

    root = cloud.parents[len(cloud.parents) - 1]
    bucket_details = fs.info(root)

    # special conditions: name always points to the bucket name
    assert bucket_details["name"] == root.bucket
    assert bucket_details["type"] == "directory"

    data = cloud / "data"
    data_details = fs.info(data)
    assert data_details["name"].rstrip("/") == data.path
    assert data_details["type"] == "directory"
def transfer(self, source, jobs=None, remote=None, command=None):
    """Transfer data items in a cloud-agnostic way.

    Args:
        source (str): url for the source location.
        jobs (int): number of jobs that can be running simultaneously.
        remote (dvc.remote.base.BaseRemote): optional remote to compare
            cache to. By default remote from core.remote config option
            is used.
        command (str): the command which is benefiting from this function
            (used to report better error messages).
    """
    from dvc.fs import get_cloud_fs

    from_fs = get_cloud_fs(self.repo, url=source)
    remote = self.get_remote(remote, command)
    return remote.transfer(from_fs, from_fs.path_info, jobs=jobs)
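# Hedged usage sketch of the transfer() API documented above. `repo.cloud`
# is assumed to expose this method, as in the add() flow earlier in this
# section (out.hash_info = repo.cloud.transfer(...)); the url, job count,
# and helper name here are illustrative, not from the original code.
def _example_transfer(repo):
    # push data from an external source url into the default remote;
    # the returned hash_info identifies the transferred data
    return repo.cloud.transfer(
        "s3://bucket/path/data",
        jobs=4,
        command="add",  # used only for better error messages
    )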
def test_download_callback(tmp_dir, dvc, cloud, local_cloud):
    cls, config, _ = get_cloud_fs(dvc, **cloud.config)
    fs = cls(**config)
    (tmp_dir / "to_upload").write_text("foo")
    fs.upload((tmp_dir / "to_upload").fs_path, (cloud / "foo").fs_path)
    expected_size = fs.getsize((cloud / "foo").fs_path)

    callback = fsspec.Callback()
    fs.download_file(
        (cloud / "foo").fs_path,
        (tmp_dir / "foo").fs_path,
        callback=callback,
    )

    assert callback.size == expected_size
    assert callback.value == expected_size
    assert (tmp_dir / "foo").read_text() == "foo"
def test_hardlink_optimization(tmp_dir, dvc, ssh):
    from dvc.fs import get_cloud_fs

    cls, config, path_info = get_cloud_fs(dvc, **ssh.config)
    fs = cls(**config)
    assert isinstance(fs, SSHFileSystem)

    from_info = path_info / "empty"
    to_info = path_info / "link"

    with fs.open(from_info, "wb"):
        pass

    if os.name == "nt":
        link_path = "c:" + to_info.path.replace("/", "\\")
    else:
        link_path = to_info.path

    # the "hardlink" is expected to be optimized into a copy/reflink,
    # so the resulting path must not be an actual hardlink
    fs.hardlink(from_info, to_info)
    assert not System.is_hardlink(link_path)