def test_hadoop_fs_checksum(mocker):
    """_hadoop_fs_checksum should parse the checksum column of
    ``hadoop fs -checksum`` output and spawn the command through a shell.
    """
    import subprocess

    mock_proc = mocker.Mock()
    # `hadoop fs -checksum` prints: <path>\t<algorithm>\t<checksum>
    out = b"/path/to/file\tMD5-of-0MD5-of-512CRC32C\t123456789"
    err = b""
    mock_proc.configure_mock(
        **{"communicate.return_value": (out, err), "returncode": 0}
    )
    mock_popen = mocker.patch("subprocess.Popen", return_value=mock_proc)

    path_info = URLInfo("hdfs://example.com:1234/path/to/file")
    assert _hadoop_fs_checksum(path_info) == "123456789"

    mock_popen.assert_called_once_with(
        "hadoop fs -checksum hdfs://example.com:1234/path/to/file",
        shell=True,
        close_fds=os.name != "nt",
        executable=os.getenv("SHELL") if os.name != "nt" else None,
        env=os.environ,
        # The original asserted the raw value -1 three times;
        # subprocess.PIPE == -1, so use the named constant instead.
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    assert mock_proc.communicate.called
def test_ssh_dir_out(dvc_repo):
    """Reproduce a stage whose directory output lives on an SSH remote."""
    if not _should_test_ssh():
        pytest.skip()

    # Configure an upstream remote plus a separate SSH cache remote.
    remote_url = get_ssh_url()
    assert main(["remote", "add", "upstream", remote_url]) == 0

    cache_url = get_ssh_url()
    assert main(["remote", "add", "sshcache", cache_url]) == 0
    assert main(["config", "cache.ssh", "sshcache"]) == 0

    # Recreate the repo object so the freshly written config is re-read.
    repo = DvcRepo(dvc_repo.root_dir)

    remote_info = URLInfo(remote_url)
    mkdir_cmd = "mkdir dir-out;cd dir-out;echo 1 > 1.txt; echo 2 > 2.txt"
    repo.run(
        cmd="ssh {netloc} 'cd {path};{cmd}'".format(
            netloc=remote_info.netloc,
            path=remote_info.path,
            cmd=mkdir_cmd,
        ),
        outs=[(remote_info / "dir-out").url],
        deps=["foo"],  # fake dependency so the stage is not a callback
    )

    repo.reproduce("dir-out.dvc")
    repo.reproduce("dir-out.dvc", force=True)
def test_get_url_external(repo_dir, dvc_repo, erepo, remote_url):
    """api.get_url with repo= should point into the external repo's remote."""
    _set_remote_url_and_commit(erepo.dvc, remote_url)

    # Using a file url forces the external repo to be cloned to a tmp dir.
    repo_url = "file://" + erepo.dvc.root_dir

    foo_url = URLInfo(remote_url) / "ac/bd18db4cc2f85cedef654fccc4a4d8"
    assert api.get_url(repo_dir.FOO, repo=repo_url) == foo_url
def test_init_git(tmp_dir, scm, cloud):
    """SSHExecutor should sync the git workspace, including the stashed
    changes referenced by EXEC_MERGE, into the remote root directory."""
    tmp_dir.scm_gen({"foo": "foo", "dir": {"bar": "bar"}}, commit="init")
    head_rev = scm.get_rev()
    scm.set_ref(EXEC_HEAD, head_rev)

    tmp_dir.gen("foo", "stashed")
    scm.gitpython.git.stash()
    stash_rev = scm.resolve_rev("stash@{0}")
    scm.set_ref(EXEC_MERGE, stash_rev)

    root_url = URLInfo(str(cloud)) / SSHExecutor.gen_dirname()
    executor = SSHExecutor(
        scm,
        ".",
        root_dir=root_url.path,
        host=root_url.host,
        port=root_url.port,
        username=TEST_SSH_USER,
        fs_factory=partial(_ssh_factory, cloud),
    )
    assert executor._repo_abspath == root_url.path

    fs = cloud._ssh
    for relpath in ("foo", "dir", posixpath.join("dir", "bar")):
        assert fs.exists(posixpath.join(executor._repo_abspath, relpath))
def test_get_url_external(remote_url, erepo_dir):
    """api.get_url with repo= should point into the external repo's remote."""
    _set_remote_url_and_commit(erepo_dir.dvc, remote_url)

    # Using a file url forces the external repo to be cloned to a tmp dir.
    repo_url = "file://{}".format(erepo_dir)

    foo_url = URLInfo(remote_url) / "ac/bd18db4cc2f85cedef654fccc4a4d8"
    assert api.get_url("foo", repo=repo_url) == foo_url
def test_exists(mocker):
    """HTTPFileSystem.exists: True on 200, False on 404, raises otherwise."""
    import io

    import requests

    from dvc.path_info import URLInfo

    res = requests.Response()
    # `raw` must be set because exists() falls back to a streaming GET
    # request when the HEAD request fails.
    res.raw = io.StringIO("foo")

    fs = HTTPFileSystem(None, {})
    mocker.patch.object(fs, "request", return_value=res)
    url = URLInfo("https://example.org/file.txt")

    for status_code, expected in ((200, True), (404, False)):
        res.status_code = status_code
        assert fs.exists(url) is expected

    res.status_code = 403
    with pytest.raises(HTTPError):
        fs.exists(url)
def test_get_url_granular(tmp_dir, dvc, s3):
    """api.get_url should resolve a tracked directory as a whole and any
    individual file inside it."""
    tmp_dir.add_remote(config=s3.config)
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "nested": {"file": "file"}}}
    )

    base = URLInfo(s3.url)
    assert api.get_url("dir") == base / "5f/c28ea78987408341668eba6525ebd1.dir"
    assert api.get_url("dir/foo") == base / "ac/bd18db4cc2f85cedef654fccc4a4d8"
    assert api.get_url("dir/bar") == base / "37/b51d194a7513e45b56f6524f2d51f2"
    assert (
        api.get_url(os.path.join("dir", "nested", "file"))
        == base / "8c/7dd922ad47494fc02c388e12c00eac"
    )
def test_get_url(repo_dir, dvc_repo, remote):
    """api.get_url should return the file's location in the default remote."""
    remote_url = remote.get_url()
    run_dvc("remote", "add", "-d", "upstream", remote_url)
    dvc_repo.add(repo_dir.FOO)

    foo_url = URLInfo(remote_url) / "ac/bd18db4cc2f85cedef654fccc4a4d8"
    assert api.get_url(repo_dir.FOO) == foo_url
def test_get_url_external(tmp_dir, erepo_dir, cloud):
    """api.get_url with repo= should point into the external repo's remote."""
    erepo_dir.add_remote(config=cloud.config)
    with erepo_dir.chdir():
        erepo_dir.dvc_gen("foo", "foo", commit="add foo")

    # Using a file url forces the external repo to be cloned to a tmp dir.
    repo_url = f"file://{erepo_dir}"

    foo_url = URLInfo(cloud.url) / "ac/bd18db4cc2f85cedef654fccc4a4d8"
    assert api.get_url("foo", repo=repo_url) == foo_url
def test_get_url_external(erepo_dir, remote_url, setup_remote):
    """api.get_url with repo= should point into the external repo's remote."""
    setup_remote(erepo_dir.dvc, url=remote_url)
    with erepo_dir.chdir():
        erepo_dir.dvc_gen("foo", "foo", commit="add foo")

    # Using a file url forces the external repo to be cloned to a tmp dir.
    repo_url = f"file://{erepo_dir}"

    foo_url = URLInfo(remote_url) / "ac/bd18db4cc2f85cedef654fccc4a4d8"
    assert api.get_url("foo", repo=repo_url) == foo_url
def test_download_fails_on_error_code(dvc):
    """HTTPRemote._download should raise HTTPError for a missing file."""
    with StaticFileServer() as httpd:
        url = "http://localhost:{}/".format(httpd.server_port)
        remote = HTTPRemote(dvc, {"url": url})

        with pytest.raises(HTTPError):
            remote._download(URLInfo(url) / "missing.txt", "missing.txt")
def test_download_fails_on_error_code(dvc):
    """HTTPRemoteTree._download should raise HTTPError for a missing file."""
    with StaticFileServer() as httpd:
        url = f"http://localhost:{httpd.server_port}/"
        tree = HTTPRemoteTree(dvc, {"url": url})

        with pytest.raises(HTTPError):
            tree._download(URLInfo(url) / "missing.txt", "missing.txt")
def test_content_length(mocker, headers, expected_size):
    """info() and _content_length() should report the size derived from the
    (parametrized) response headers."""
    res = requests.Response()
    res.headers.update(headers)
    res.status_code = 200

    fs = HTTPFileSystem()
    mocker.patch.object(fs, "request", return_value=res)

    url = URLInfo("https://example.org/file.txt")
    assert fs.info(url) == {"etag": None, "size": expected_size}
    assert fs._content_length(res) == expected_size
def test_ssh_dir_out(tmp_dir, dvc, ssh_server):
    """Reproduce a stage that writes a directory output to an SSH remote."""
    tmp_dir.gen({"foo": "foo content"})

    creds = ssh_server.test_creds
    user = creds["username"]
    port = ssh_server.port
    keyfile = creds["key_filename"]

    # Configure an upstream remote and a separate SSH cache remote.
    remote_url = SSHMocked.get_url(user, port)
    assert main(["remote", "add", "upstream", remote_url]) == 0
    assert main(["remote", "modify", "upstream", "keyfile", keyfile]) == 0

    cache_url = SSHMocked.get_url(user, port)
    assert main(["remote", "add", "sshcache", cache_url]) == 0
    assert main(["config", "cache.ssh", "sshcache"]) == 0
    assert main(["remote", "modify", "sshcache", "keyfile", keyfile]) == 0

    # Recreate the repo object so the freshly written config is re-read.
    repo = DvcRepo(dvc.root_dir)

    # Avoid "WARNING: UNPROTECTED PRIVATE KEY FILE" from ssh.
    os.chmod(keyfile, 0o600)

    (tmp_dir / "script.py").write_text(
        "import sys, pathlib\n"
        "path = pathlib.Path(sys.argv[1])\n"
        "dir_out = path / 'dir-out'\n"
        "dir_out.mkdir()\n"
        "(dir_out / '1.txt').write_text('1')\n"
        "(dir_out / '2.txt').write_text('2')\n"
    )

    remote_info = URLInfo(remote_url)
    repo.run(
        cmd="python {} {}".format(tmp_dir / "script.py", remote_info.path),
        outs=["remote://upstream/dir-out"],
        deps=["foo"],  # fake dependency so the stage is not a callback
    )

    repo.reproduce("dir-out.dvc")
    repo.reproduce("dir-out.dvc", force=True)
def test_init_cache(tmp_dir, dvc, scm, cloud):
    """SSHExecutor.init_cache should push the local cache entry for `foo`
    into the remote repo's .dvc/cache directory."""
    foo = tmp_dir.dvc_gen("foo", "foo", commit="init")[0].outs[0]
    rev = scm.get_rev()
    scm.set_ref(EXEC_HEAD, rev)
    scm.set_ref(EXEC_MERGE, rev)

    root_url = URLInfo(str(cloud)) / SSHExecutor.gen_dirname()
    executor = SSHExecutor(
        scm,
        ".",
        root_dir=root_url.path,
        host=root_url.host,
        port=root_url.port,
        username=TEST_SSH_USER,
        fs_factory=partial(_ssh_factory, cloud),
    )
    executor.init_cache(dvc, rev)

    fs = cloud._ssh
    digest = foo.hash_info.value
    # Cache layout: .dvc/cache/<first two hex chars>/<rest of the hash>
    cached_path = posixpath.join(
        executor._repo_abspath, ".dvc", "cache", digest[:2], digest[2:]
    )
    assert fs.exists(cached_path)
def test_get_url(tmp_dir, dvc, remote):
    """api.get_url should return the file's location in the default remote."""
    tmp_dir.dvc_gen("foo", "foo")
    foo_url = URLInfo(remote.url) / "ac/bd18db4cc2f85cedef654fccc4a4d8"
    assert api.get_url("foo") == foo_url
def test_get_url(remote_url, tmp_dir, dvc, repo_template):
    """api.get_url should return the file's location in the default remote."""
    run_dvc("remote", "add", "-d", "upstream", remote_url)
    dvc.add("foo")

    foo_url = URLInfo(remote_url) / "ac/bd18db4cc2f85cedef654fccc4a4d8"
    assert api.get_url("foo") == foo_url
def test_get_url(tmp_dir, dvc, remote_url):
    """api.get_url should return the file's location in the default remote."""
    run_dvc("remote", "add", "-d", "upstream", remote_url)
    tmp_dir.dvc_gen("foo", "foo")

    foo_url = URLInfo(remote_url) / "ac/bd18db4cc2f85cedef654fccc4a4d8"
    assert api.get_url("foo") == foo_url