def test_mkdir_rm_recursive(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )

    fs.mkdir("test_mkdir_rm_recursive")
    assert "test_mkdir_rm_recursive/" in fs.ls("")

    with fs.open("test_mkdir_rm_recursive/file.txt", "wb") as f:
        f.write(b"0123456789")

    with fs.open("test_mkdir_rm_recursive/dir/file.txt", "wb") as f:
        f.write(b"ABCD")

    with fs.open("test_mkdir_rm_recursive/dir/file2.txt", "wb") as f:
        f.write(b"abcdef")

    assert fs.find("test_mkdir_rm_recursive") == [
        "test_mkdir_rm_recursive/dir/file.txt",
        "test_mkdir_rm_recursive/dir/file2.txt",
        "test_mkdir_rm_recursive/file.txt",
    ]

    fs.rm("test_mkdir_rm_recursive", recursive=True)

    assert "test_mkdir_rm_recursive/" not in fs.ls("")
    assert fs.find("test_mkdir_rm_recursive") == []

def test_metadata_write(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    fs.mkdir("test_metadata_write")
    data = b"0123456789"
    metadata = {"meta": "data"}

    # standard blob type
    with fs.open("test_metadata_write/file.txt", "wb", metadata=metadata) as f:
        f.write(data)
    info = fs.info("test_metadata_write/file.txt")
    assert info["metadata"] == metadata

    metadata_changed_on_write = {"meta": "datum"}
    with fs.open(
        "test_metadata_write/file.txt", "wb", metadata=metadata_changed_on_write
    ) as f:
        f.write(data)
    info = fs.info("test_metadata_write/file.txt")
    assert info["metadata"] == metadata_changed_on_write

    # append blob type
    new_metadata = {"data": "meta"}
    with fs.open("test_metadata_write/append-file.txt", "ab", metadata=metadata) as f:
        f.write(data)

    # try to change metadata while appending blocks
    with fs.open(
        "test_metadata_write/append-file.txt", "ab", metadata=new_metadata
    ) as f:
        f.write(data)
    info = fs.info("test_metadata_write/append-file.txt")

    # the Azure blob client doesn't seem to support metadata mutation when
    # appending blocks; make sure this behavior doesn't change, since that
    # would imply a potential breaking change
    assert info["metadata"] == metadata

    # getxattr / setxattrs
    assert fs.getxattr("test_metadata_write/file.txt", "meta") == "datum"
    fs.setxattrs("test_metadata_write/file.txt", metadata="data2")
    assert fs.getxattr("test_metadata_write/file.txt", "metadata") == "data2"
    assert fs.info("test_metadata_write/file.txt")["metadata"] == {"metadata": "data2"}

    # empty file and nested directory
    with fs.open(
        "test_metadata_write/a/b/c/nested-file.txt", "wb", metadata=metadata
    ) as f:
        f.write(b"")
    assert fs.getxattr("test_metadata_write/a/b/c/nested-file.txt", "meta") == "data"
    fs.setxattrs("test_metadata_write/a/b/c/nested-file.txt", metadata="data2")
    assert fs.info("test_metadata_write/a/b/c/nested-file.txt")["metadata"] == {
        "metadata": "data2"
    }

    fs.rmdir("test_metadata_write")

def setup(self, stage=None):
    data_dir = "datasets/mnist"
    storage_options = {"account_name": "azuremlexamples"}
    fs = AzureBlobFileSystem(**storage_options)

    files = fs.ls(data_dir)
    train_len = 60000
    test_len = 10000

    for f in files:
        if "train-images" in f:
            self.X_train = self._read_images(gzip.open(fs.open(f)), train_len)
        elif "train-labels" in f:
            self.y_train = self._read_labels(gzip.open(fs.open(f)), train_len)
        elif "images" in f:
            self.X_test = self._read_images(gzip.open(fs.open(f)), test_len)
        elif "labels" in f:
            self.y_test = self._read_labels(gzip.open(fs.open(f)), test_len)

    self.ohe = OneHotEncoder().fit(self.y_train.reshape(-1, 1))

    self.mnist_train = list(
        zip(
            self.X_train,
            self.ohe.transform(self.y_train.reshape(-1, 1)).toarray(),
        )
    )
    self.mnist_test = list(
        zip(
            self.X_test,
            self.ohe.transform(self.y_test.reshape(-1, 1)).toarray(),
        )
    )

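# `_read_images` and `_read_labels` are methods on the same DataModule class
# and are not shown in this excerpt. A minimal sketch of what they might look
# like, assuming the standard MNIST IDX layout (16-byte image header, 8-byte
# label header); the implementations below are illustrative only.
import numpy as np


def _read_images(self, f, count):
    f.read(16)  # skip the IDX image header (magic, count, rows, cols)
    buf = f.read(28 * 28 * count)
    return np.frombuffer(buf, dtype=np.uint8).reshape(count, 28, 28)


def _read_labels(self, f, count):
    f.read(8)  # skip the IDX label header (magic, count)
    buf = f.read(count)
    return np.frombuffer(buf, dtype=np.uint8)
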
def test_mkdir_rmdir(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    fs.mkdir("new-container")
    assert "new-container/" in fs.ls("")
    assert fs.ls("new-container") == []

    with fs.open(path="new-container/file.txt", mode="wb") as f:
        f.write(b"0123456789")
    with fs.open("new-container/dir/file.txt", "wb") as f:
        f.write(b"0123456789")
    with fs.open("new-container/dir/file2.txt", "wb") as f:
        f.write(b"0123456789")

    # Verify you can skip making a directory if the container already
    # exists, but still create files inside that directory
    fs.mkdir("new-container/dir/file.txt", exist_ok=False)
    assert "new-container/" in fs.ls("")

    fs.mkdir("new-container/file2.txt", exist_ok=False)
    assert "new-container/file2.txt" in fs.ls("new-container")

    # Verify the blob created by mkdir is empty
    with fs.open("new-container/file2.txt", "rb") as f:
        outfile = f.read()
    assert outfile == b""

    # Verify that mkdir on a path where a nested file already exists
    # leaves the file in place
    fs.mkdir("new-container/dir/file2.txt", exist_ok=False)
    assert "new-container/dir/file2.txt" in fs.ls("new-container/dir")

    # Also verify you can make a nested directory structure
    fs.mkdir("new-container/dir2/file.txt", exist_ok=False)
    with fs.open("new-container/dir2/file.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/dir2/file.txt" in fs.ls("new-container/dir2")

    fs.rm("new-container/dir2", recursive=True)
    fs.rm("new-container/dir", recursive=True)
    assert fs.ls("new-container") == [
        "new-container/file.txt",
        "new-container/file2.txt",
    ]

    fs.rm("new-container/file.txt")
    fs.rm("new-container/file2.txt")
    fs.rmdir("new-container")
    assert "new-container/" not in fs.ls("")

def test_makedir_rmdir(storage, caplog):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    fs.makedir("new-container")
    assert "new-container" in fs.ls("")
    assert fs.ls("new-container") == []

    with fs.open(path="new-container/file.txt", mode="wb") as f:
        f.write(b"0123456789")
    with fs.open("new-container/dir/file.txt", "wb") as f:
        f.write(b"0123456789")
    with fs.open("new-container/dir/file2.txt", "wb") as f:
        f.write(b"0123456789")

    # Verify that makedir raises an exception if the directory exists
    # and exist_ok is False
    with pytest.raises(FileExistsError):
        fs.makedir("new-container/dir/file.txt", exist_ok=False)

    # makedir should also raise if the container exists and we try to
    # create a nested directory with exist_ok=False
    with pytest.raises(FileExistsError):
        fs.makedir("new-container/dir2", exist_ok=False)

    # makedir on an existing nested file is a no-op when exist_ok is True
    fs.makedir("new-container/dir/file2.txt", exist_ok=True)
    assert "new-container/dir/file2.txt" in fs.ls("new-container/dir")

    # Also verify you can make a nested directory structure
    with fs.open("new-container/dir2/file.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/dir2/file.txt" in fs.ls("new-container/dir2")

    fs.rm("new-container/dir2", recursive=True)
    fs.rm("new-container/dir", recursive=True)
    fs.touch("new-container/file2.txt")
    assert fs.ls("new-container") == [
        "new-container/file.txt",
        "new-container/file2.txt",
    ]

    fs.rm("new-container/file.txt")
    fs.rm("new-container/file2.txt")
    fs.rmdir("new-container")
    assert "new-container" not in fs.ls("")

def test_append_operation(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    fs.mkdir("append-container")

    # Check that appending to an existing file works as expected
    with fs.open("append-container/append_file.txt", "ab") as f:
        f.write(b"0123456789")
    with fs.open("append-container/append_file.txt", "ab") as f:
        f.write(b"0123456789")

    # Read back the blob and verify both appends landed
    with fs.open("append-container/append_file.txt", "rb") as f:
        outfile = f.read()
    assert outfile == b"01234567890123456789"

    fs.rm("append-container", recursive=True)

def test_fetch_second_half(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )
    blob = fs.open("data/top_file.txt")
    # Fetch the second half of the ten-byte blob
    assert len(blob._fetch_range(start=5, end=10)) == 5

def test_open_file(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    f = fs.open("/data/root/a/file.txt")
    result = f.read()
    assert result == b"0123456789"

def test_cat(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    fs.mkdir("catdir")
    data = b"0123456789"
    with fs.open("catdir/catfile.txt", "wb") as f:
        f.write(data)
    assert fs.cat("catdir/catfile.txt") == data
    fs.rm("catdir/catfile.txt")

def test_open_context_manager(storage, mocker):
    "test closing azure client with context manager"
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    with fs.open("/data/root/a/file.txt") as f:
        close = mocker.patch.object(f.container_client, "close")
        result = f.read()
        assert result == b"0123456789"
    close.assert_called_once()

def test_open_file(storage, mocker):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    f = fs.open("/data/root/a/file.txt")
    result = f.read()
    assert result == b"0123456789"

    close = mocker.patch.object(f.container_client, "close")
    f.close()
    close.assert_called_once()

def test_url(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
        account_key=KEY,
    )
    fs.mkdir("catdir")
    data = b"0123456789"
    with fs.open("catdir/catfile.txt", "wb") as f:
        f.write(data)

    import requests

    # fs.url() returns a SAS-signed HTTP URL; the account key is needed
    # to generate the signature
    r = requests.get(fs.url("catdir/catfile.txt"))
    assert r.status_code == 200
    assert r.content == data

    fs.rm("catdir/catfile.txt")

def test_mkdir_rmdir(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    fs.mkdir("new-container")
    assert "new-container/" in fs.ls("")
    assert fs.ls("new-container") == []

    with fs.open(path="new-container/file.txt", mode="wb") as f:
        f.write(b"0123456789")
    with fs.open("new-container/dir/file.txt", "wb") as f:
        f.write(b"0123456789")
    with fs.open("new-container/dir/file2.txt", "wb") as f:
        f.write(b"0123456789")

    # Verify you can skip making a directory if the container already
    # exists, but still create files inside that directory
    fs.mkdir("new-container/dir/file.txt", exists_ok=True)
    assert "new-container/" in fs.ls("")

    fs.mkdir("new-container/file2.txt", exists_ok=True)
    with fs.open("new-container/file2.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/file2.txt" in fs.ls("new-container")

    fs.mkdir("new-container/dir/file2.txt", exists_ok=True)
    with fs.open("new-container/dir/file2.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/dir/file2.txt" in fs.ls("new-container/dir")

    # Also verify you can make a nested directory structure
    fs.mkdir("new-container/dir2/file.txt", exists_ok=True)
    with fs.open("new-container/dir2/file.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/dir2/file.txt" in fs.ls("new-container/dir2")

    fs.rm("new-container/dir2", recursive=True)
    fs.rm("new-container/dir", recursive=True)
    assert fs.ls("new-container") == [
        "new-container/file.txt",
        "new-container/file2.txt",
    ]

    fs.rm("new-container/file.txt")
    fs.rm("new-container/file2.txt")
    fs.rmdir("new-container")
    assert "new-container/" not in fs.ls("")

def test_deep_paths(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )

    fs.mkdir("test_deep")
    assert "test_deep/" in fs.ls("")

    with fs.open("test_deep/a/b/c/file.txt", "wb") as f:
        f.write(b"0123456789")

    assert fs.ls("test_deep") == ["test_deep/a/"]
    assert fs.ls("test_deep/") == ["test_deep/a/"]
    assert fs.ls("test_deep/a") == ["test_deep/a/b/"]
    assert fs.ls("test_deep/a/") == ["test_deep/a/b/"]

    assert fs.find("test_deep") == ["test_deep/a/b/c/file.txt"]
    assert fs.find("test_deep/") == ["test_deep/a/b/c/file.txt"]
    assert fs.find("test_deep/a") == ["test_deep/a/b/c/file.txt"]
    assert fs.find("test_deep/a/") == ["test_deep/a/b/c/file.txt"]

    fs.rm("test_deep", recursive=True)

    assert "test_deep/" not in fs.ls("")
    assert fs.find("test_deep") == []

def test_fetch_first_half(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )
    blob = fs.open("data/top_file.txt")
    assert len(blob._fetch_range(start=0, end=5)) == 5

def test_large_blob(storage):
    import hashlib
    import io
    import shutil
    import tempfile
    from pathlib import Path

    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )

    # create a ~120 MB byte array, ensuring it's larger than the blocksizes
    # to force a chunked upload
    blob_size = 120_000_000
    # blob_size = 2_684_354_560
    assert blob_size > fs.blocksize
    assert blob_size > AzureBlobFile.DEFAULT_BLOCK_SIZE

    data = b"1" * blob_size
    _hash = hashlib.md5(data)
    expected = _hash.hexdigest()

    # create container
    fs.mkdir("chunk-container")

    # upload the data using fs.open
    path = "chunk-container/large-blob.bin"
    with fs.open(path, "ab") as dst:
        dst.write(data)

    assert fs.exists(path)
    assert fs.size(path) == blob_size

    del data

    # download with fs.open
    bio = io.BytesIO()
    with fs.open(path, "rb") as src:
        shutil.copyfileobj(src, bio)

    # read back the data and calculate md5
    bio.seek(0)
    data = bio.read()
    _hash = hashlib.md5(data)
    result = _hash.hexdigest()
    assert expected == result

    # do the same but using upload/download and a tempdir
    path = "chunk-container/large_blob2.bin"
    with tempfile.TemporaryDirectory() as td:
        local_blob: Path = Path(td) / "large_blob2.bin"
        with local_blob.open("wb") as fo:
            fo.write(data)
        assert local_blob.exists()
        assert local_blob.stat().st_size == blob_size

        fs.upload(str(local_blob), path)
        assert fs.exists(path)
        assert fs.size(path) == blob_size

        # download now
        local_blob.unlink()
        fs.download(path, str(local_blob))
        assert local_blob.exists()
        assert local_blob.stat().st_size == blob_size

def test_fetch_length_is_none(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )
    blob = fs.open("data/top_file.txt")
    assert len(blob._fetch_range(start=2, end=None)) == 8

def test_makedir_rmdir(storage, caplog):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    fs.makedir("new-container")
    assert "new-container/" in fs.ls("")
    assert fs.ls("new-container") == []

    with fs.open(path="new-container/file.txt", mode="wb") as f:
        f.write(b"0123456789")
    with fs.open("new-container/dir/file.txt", "wb") as f:
        f.write(b"0123456789")
    with fs.open("new-container/dir/file2.txt", "wb") as f:
        f.write(b"0123456789")

    # Verify that makedir raises an exception if the directory exists
    # and exist_ok is False
    with pytest.raises(FileExistsError):
        fs.makedir("new-container/dir/file.txt", exist_ok=False)

    # Verify that makedir creates a directory if exist_ok is False and the
    # directory does not exist
    fs.makedir("new-container/file2.txt", exist_ok=False)
    assert "new-container/file2.txt" in fs.ls("new-container")

    # Verify that makedir silently ignores an existing directory if
    # the directory exists and exist_ok is True
    fs.makedir("new-container/dir", exist_ok=True)
    assert "new-container/dir/" in fs.ls("new-container")

    # Verify the blob created by makedir is empty
    with fs.open("new-container/file2.txt", "rb") as f:
        outfile = f.read()
    assert outfile == b""

    # makedir on an existing nested file is a no-op when exist_ok is True
    fs.makedir("new-container/dir/file2.txt", exist_ok=True)
    assert "new-container/dir/file2.txt" in fs.ls("new-container/dir")

    # Also verify you can make a nested directory structure
    fs.makedir("new-container/dir2/file.txt", exist_ok=False)
    with fs.open("new-container/dir2/file.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/dir2/file.txt" in fs.ls("new-container/dir2")

    fs.rm("new-container/dir2", recursive=True)
    fs.rm("new-container/dir", recursive=True)
    assert fs.ls("new-container") == [
        "new-container/file.txt",
        "new-container/file2.txt",
    ]

    fs.rm("new-container/file.txt")
    fs.rm("new-container/file2.txt")
    fs.rmdir("new-container")
    assert "new-container/" not in fs.ls("")

def test_fetch_entire_blob(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )
    blob = fs.open("data/top_file.txt")
    assert len(blob._fetch_range(start=0, end=10)) == 10

def test_connect_async_open_credential(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, credential=DefaultAzureCredential()
    )
    fs.open(path="")

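# The snippets above assume module-level imports and test fixtures supplied by
# the surrounding harness. A minimal sketch of what those might look like,
# assuming a local Azurite storage emulator; the fixture body, account name,
# and elided key below are illustrative, not the project's actual setup.
# (`mocker` comes from the pytest-mock plugin.)
import pytest
from azure.identity import DefaultAzureCredential

from adlfs import AzureBlobFile, AzureBlobFileSystem

KEY = "<storage-account-key>"  # elided; supply the emulator/account key
CONN_STR = (
    "DefaultEndpointsProtocol=http;"
    "AccountName=devstoreaccount1;"
    f"AccountKey={KEY};"
    "BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;"
)


@pytest.fixture()
def storage():
    # Hypothetical stand-in for the real fixture, which would start the
    # emulator, seed the "data" container read by the fetch/open tests,
    # and yield an object exposing the account name.
    class StorageAccount:
        account_name = "devstoreaccount1"

    yield StorageAccount()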