Exemple #1
0
def test_put_file(storage):
    fs = AzureBlobFileSystem(account_name=storage.account_name,
                             connection_string=CONN_STR)
    lfs = LocalFileSystem()

    fs.mkdir("putdir")

    # Check that put on an empty file works
    with open("sample.txt", "wb") as f:
        f.write(b"")
    fs.put("sample.txt", "putdir/sample.txt")
    fs.get("putdir/sample.txt", "sample2.txt")

    with open("sample.txt", "rb") as f:
        f1 = f.read()
    with open("sample2.txt", "rb") as f:
        f2 = f.read()
    assert f1 == f2

    lfs.rm("sample.txt")
    lfs.rm("sample2.txt")

    # Check that put on a file with data works
    with open("sample3.txt", "wb") as f:
        f.write(b"01234567890")
    fs.put("sample3.txt", "putdir/sample3.txt")
    fs.get("putdir/sample3.txt", "sample4.txt")
    with open("sample3.txt", "rb") as f:
        f3 = f.read()
    with open("sample4.txt", "rb") as f:
        f4 = f.read()
    assert f3 == f4
    fs.rm("putdir", recursive=True)
Exemple #2
0
def test_mkdir_rm_recursive(storage):
    fs = AzureBlobFileSystem(account_name=storage.account_name,
                             connection_string=CONN_STR)

    fs.mkdir("test_mkdir_rm_recursive")
    assert "test_mkdir_rm_recursive/" in fs.ls("")

    with fs.open("test_mkdir_rm_recursive/file.txt", "wb") as f:
        f.write(b"0123456789")

    with fs.open("test_mkdir_rm_recursive/dir/file.txt", "wb") as f:
        f.write(b"ABCD")

    with fs.open("test_mkdir_rm_recursive/dir/file2.txt", "wb") as f:
        f.write(b"abcdef")

    assert fs.find("test_mkdir_rm_recursive") == [
        "test_mkdir_rm_recursive/dir/file.txt",
        "test_mkdir_rm_recursive/dir/file2.txt",
        "test_mkdir_rm_recursive/file.txt",
    ]

    fs.rm("test_mkdir_rm_recursive", recursive=True)

    assert "test_mkdir_rm_recursive/" not in fs.ls("")
    assert fs.find("test_mkdir_rm_recursive") == []
Exemple #3
0
def test_mkdir(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR,
    )

    # Verify mkdir will create a new container when create_parents is True
    fs.mkdir("new-container", create_parents=True)
    assert "new-container" in fs.ls(".")
    fs.rm("new-container")

    # Verify a new container will not be created when create_parents
    # is False
    with pytest.raises(PermissionError):
        fs.mkdir("new-container", create_parents=False)

    # Test creating subdirectory when container does not exist
    # Since mkdir is a no-op, if create_parents=True, it will create
    # the top level container, but will NOT create nested directories
    fs.mkdir("new-container/dir", create_parents=True)
    assert "new-container/dir" not in fs.ls("new-container")
    assert "new-container" in fs.ls(".")
    fs.rm("new-container", recursive=True)

    # Test that creating a directory when already exists passes
    fs.mkdir("data")
    assert "data" in fs.ls(".")

    # Test raising error when container does not exist
    with pytest.raises(PermissionError):
        fs.mkdir("new-container/dir", create_parents=False)
Exemple #4
0
def test_metadata_write(storage):
    fs = AzureBlobFileSystem(account_name=storage.account_name,
                             connection_string=CONN_STR)
    fs.mkdir("test_metadata_write")
    data = b"0123456789"
    metadata = {"meta": "data"}

    # standard blob type
    with fs.open("test_metadata_write/file.txt", "wb", metadata=metadata) as f:
        f.write(data)
    info = fs.info("test_metadata_write/file.txt")
    assert info["metadata"] == metadata
    metadata_changed_on_write = {"meta": "datum"}
    with fs.open("test_metadata_write/file.txt",
                 "wb",
                 metadata=metadata_changed_on_write) as f:
        f.write(data)
    info = fs.info("test_metadata_write/file.txt")
    assert info["metadata"] == metadata_changed_on_write

    # append blob type
    new_metadata = {"data": "meta"}
    with fs.open("test_metadata_write/append-file.txt",
                 "ab",
                 metadata=metadata) as f:
        f.write(data)

    # try change metadata on block appending
    with fs.open("test_metadata_write/append-file.txt",
                 "ab",
                 metadata=new_metadata) as f:
        f.write(data)
    info = fs.info("test_metadata_write/append-file.txt")

    # azure blob client doesn't seem to support metadata mutation when appending blocks
    # lets be sure this behavior doesn't change as this would imply
    # a potential breaking change
    assert info["metadata"] == metadata

    # getxattr / setxattr
    assert fs.getxattr("test_metadata_write/file.txt", "meta") == "datum"
    fs.setxattrs("test_metadata_write/file.txt", metadata="data2")
    assert fs.getxattr("test_metadata_write/file.txt", "metadata") == "data2"
    assert fs.info("test_metadata_write/file.txt")["metadata"] == {
        "metadata": "data2"
    }

    # empty file and nested directory
    with fs.open("test_metadata_write/a/b/c/nested-file.txt",
                 "wb",
                 metadata=metadata) as f:
        f.write(b"")
    assert fs.getxattr("test_metadata_write/a/b/c/nested-file.txt",
                       "meta") == "data"
    fs.setxattrs("test_metadata_write/a/b/c/nested-file.txt", metadata="data2")
    assert fs.info(
        "test_metadata_write/a/b/c/nested-file.txt")["metadata"] == {
            "metadata": "data2"
        }
    fs.rmdir("test_metadata_write")
Exemple #5
0
def test_cat(storage):
    fs = AzureBlobFileSystem(account_name=storage.account_name,
                             connection_string=CONN_STR)
    fs.mkdir("catdir")
    data = b"0123456789"
    with fs.open("catdir/catfile.txt", "wb") as f:
        f.write(data)
    assert fs.cat("catdir/catfile.txt") == data
    fs.rm("catdir/catfile.txt")
Exemple #6
0
def test_mkdir_rmdir(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    fs.mkdir("new-container")
    assert "new-container/" in fs.ls("")
    assert fs.ls("new-container") == []

    with fs.open(path="new-container/file.txt", mode="wb") as f:
        f.write(b"0123456789")

    with fs.open("new-container/dir/file.txt", "wb") as f:
        f.write(b"0123456789")

    with fs.open("new-container/dir/file2.txt", "wb") as f:
        f.write(b"0123456789")

    # Check to verify you can skip making a directory if the container
    # already exists, but still create a file in that directory
    fs.mkdir("new-container/dir/file.txt", exist_ok=False)
    assert "new-container/" in fs.ls("")

    fs.mkdir("new-container/file2.txt", exist_ok=False)
    assert "new-container/file2.txt" in fs.ls("new-container")

    # Test to verify that the file contains expected contents
    with fs.open("new-container/file2.txt", "rb") as f:
        outfile = f.read()
    assert outfile == b""

    # Check that trying to overwrite an existing nested file in append mode works as expected
    fs.mkdir("new-container/dir/file2.txt", exist_ok=False)
    assert "new-container/dir/file2.txt" in fs.ls("new-container/dir")

    # Also verify you can make a nested directory structure
    fs.mkdir("new-container/dir2/file.txt", exist_ok=False)
    with fs.open("new-container/dir2/file.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/dir2/file.txt" in fs.ls("new-container/dir2")
    fs.rm("new-container/dir2", recursive=True)

    fs.rm("new-container/dir", recursive=True)
    assert fs.ls("new-container") == [
        "new-container/file.txt",
        "new-container/file2.txt",
    ]

    fs.rm("new-container/file.txt")
    fs.rm("new-container/file2.txt")
    fs.rmdir("new-container")

    assert "new-container/" not in fs.ls("")
Exemple #7
0
def test_cp_file(storage):
    fs = AzureBlobFileSystem(account_name=storage.account_name,
                             connection_string=CONN_STR)
    fs.mkdir("homedir")
    fs.mkdir("homedir/enddir")
    fs.touch("homedir/startdir/test_file.txt")
    fs.cp_file("homedir/startdir/test_file.txt",
               "homedir/enddir/test_file.txt")
    files = fs.ls("homedir/enddir")
    assert "homedir/enddir/test_file.txt" in files

    fs.rm("homedir", recursive=True)
Exemple #8
0
def test_mkdir_rmdir(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    fs.mkdir("new-container")
    assert "new-container/" in fs.ls("")
    assert fs.ls("new-container") == []

    with fs.open(path="new-container/file.txt", mode="wb") as f:
        f.write(b"0123456789")

    with fs.open("new-container/dir/file.txt", "wb") as f:
        f.write(b"0123456789")

    with fs.open("new-container/dir/file2.txt", "wb") as f:
        f.write(b"0123456789")

    # Check to verify you can skip making a directory if the container
    # already exists, but still create a file in that directory
    fs.mkdir("new-container/dir/file.txt", exists_ok=True)
    assert "new-container/" in fs.ls("")

    fs.mkdir("new-container/file2.txt", exists_ok=True)
    with fs.open("new-container/file2.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/file2.txt" in fs.ls("new-container")

    fs.mkdir("new-container/dir/file2.txt", exists_ok=True)
    with fs.open("new-container/dir/file2.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/dir/file2.txt" in fs.ls("new-container/dir")

    # Also verify you can make a nested directory structure
    fs.mkdir("new-container/dir2/file.txt", exists_ok=True)
    with fs.open("new-container/dir2/file.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/dir2/file.txt" in fs.ls("new-container/dir2")
    fs.rm("new-container/dir2", recursive=True)

    fs.rm("new-container/dir", recursive=True)
    assert fs.ls("new-container") == [
        "new-container/file.txt",
        "new-container/file2.txt",
    ]

    fs.rm("new-container/file.txt")
    fs.rm("new-container/file2.txt")
    fs.rmdir("new-container")

    assert "new-container/" not in fs.ls("")
Exemple #9
0
def test_append_operation(storage):
    fs = AzureBlobFileSystem(account_name=storage.account_name,
                             connection_string=CONN_STR)
    fs.mkdir("append-container")

    # Check that appending to an existing file works as expected
    with fs.open("append-container/append_file.txt", "ab") as f:
        f.write(b"0123456789")
    with fs.open("append-container/append_file.txt", "ab") as f:
        f.write(b"0123456789")
    with fs.open("new-container/dir/file2.txt", "rb") as f:
        outfile = f.read()
    assert outfile == b"01234567890123456789"

    fs.rm("append-container", recursive=True)
Exemple #10
0
def test_url(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR, account_key=KEY
    )
    fs.mkdir("catdir")
    data = b"0123456789"
    with fs.open("catdir/catfile.txt", "wb") as f:
        f.write(data)

    import requests

    r = requests.get(fs.url("catdir/catfile.txt"))
    assert r.status_code == 200
    assert r.content == data

    fs.rm("catdir/catfile.txt")
Exemple #11
0
    def fs(self):
        from adlfs import AzureBlobFileSystem
        from azure.core.exceptions import AzureError

        try:
            file_system = AzureBlobFileSystem(**self.login_info)
            if self.bucket not in [
                    container.rstrip("/") for container in file_system.ls("/")
            ]:
                file_system.mkdir(self.bucket)
        except (ValueError, AzureError) as e:
            raise AzureAuthError(
                f"Authentication to Azure Blob Storage via {self.login_method}"
                " failed.\nLearn more about configuration settings at"
                f" {format_link('https://man.dvc.org/remote/modify')}") from e

        return file_system
Exemple #12
0
def test_dask_parquet(storage):
    fs = AzureBlobFileSystem(account_name=storage.account_name,
                             connection_string=CONN_STR)
    fs.mkdir("test")
    STORAGE_OPTIONS = {
        "account_name": "devstoreaccount1",
        "connection_string": CONN_STR,
    }
    df = pd.DataFrame({
        "col1": [1, 2, 3, 4],
        "col2": [2, 4, 6, 8],
        "index_key": [1, 1, 2, 2],
        "partition_key": [1, 1, 2, 2],
    })

    dask_dataframe = dd.from_pandas(df, npartitions=1)
    for protocol in ["abfs", "az"]:
        dask_dataframe.to_parquet(
            "{}://[email protected]/test_group.parquet".format(
                protocol),
            storage_options=STORAGE_OPTIONS,
            engine="pyarrow",
        )

        fs = AzureBlobFileSystem(**STORAGE_OPTIONS)
        assert fs.ls("test/test_group.parquet") == [
            "test/test_group.parquet/_common_metadata",
            "test/test_group.parquet/_metadata",
            "test/test_group.parquet/part.0.parquet",
        ]
        fs.rm("test/test_group.parquet")

    df_test = dd.read_parquet(
        "abfs://test/test_group.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df, df_test)
Exemple #13
0
def test_deep_paths(storage):
    fs = AzureBlobFileSystem(account_name=storage.account_name,
                             connection_string=CONN_STR)

    fs.mkdir("test_deep")
    assert "test_deep/" in fs.ls("")

    with fs.open("test_deep/a/b/c/file.txt", "wb") as f:
        f.write(b"0123456789")

    assert fs.ls("test_deep") == ["test_deep/a/"]
    assert fs.ls("test_deep/") == ["test_deep/a/"]
    assert fs.ls("test_deep/a") == ["test_deep/a/b/"]
    assert fs.ls("test_deep/a/") == ["test_deep/a/b/"]
    assert fs.find("test_deep") == ["test_deep/a/b/c/file.txt"]
    assert fs.find("test_deep/") == ["test_deep/a/b/c/file.txt"]
    assert fs.find("test_deep/a") == ["test_deep/a/b/c/file.txt"]
    assert fs.find("test_deep/a/") == ["test_deep/a/b/c/file.txt"]

    fs.rm("test_deep", recursive=True)

    assert "test_deep/" not in fs.ls("")
    assert fs.find("test_deep") == []
Exemple #14
0
def test_mkdir(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    # Verify mkdir will create a new container when create_parents is True
    fs.mkdir("new-container", create_parents=True)
    assert "new-container/" in fs.ls(".")
    fs.rm("new-container")

    # Verify a new container will not be created when create_parents
    # is False
    with pytest.raises(PermissionError):
        fs.mkdir("new-container", create_parents=False)

    # Test creating subdirectory when container does not exist
    fs.mkdir("new-container/dir", create_parents=True)
    assert "new-container/dir" in fs.ls("new-container")
    fs.rm("new-container", recursive=True)

    # Test raising error when container does not exist
    with pytest.raises(PermissionError):
        fs.mkdir("new-container/dir", create_parents=False)
Exemple #15
0
def test_dask_parquet(storage):
    fs = AzureBlobFileSystem(account_name=storage.account_name,
                             connection_string=CONN_STR)
    fs.mkdir("test")
    STORAGE_OPTIONS = {
        "account_name": "devstoreaccount1",
        "connection_string": CONN_STR,
    }
    df = pd.DataFrame({
        "col1": [1, 2, 3, 4],
        "col2": [2, 4, 6, 8],
        "index_key": [1, 1, 2, 2],
        "partition_key": [1, 1, 2, 2],
    })

    dask_dataframe = dd.from_pandas(df, npartitions=1)
    for protocol in ["abfs", "az"]:
        dask_dataframe.to_parquet(
            "{}://test/test_group.parquet".format(protocol),
            storage_options=STORAGE_OPTIONS,
            engine="pyarrow",
        )

        fs = AzureBlobFileSystem(**STORAGE_OPTIONS)
        assert fs.ls("test/test_group.parquet") == [
            "test/test_group.parquet/_common_metadata",
            "test/test_group.parquet/_metadata",
            "test/test_group.parquet/part.0.parquet",
        ]
        fs.rm("test/test_group.parquet")

    df_test = dd.read_parquet(
        "abfs://test/test_group.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df, df_test)

    A = np.random.randint(0, 100, size=(10000, 4))
    df2 = pd.DataFrame(data=A, columns=list("ABCD"))
    ddf2 = dd.from_pandas(df2, npartitions=4)
    dd.to_parquet(
        ddf2,
        "abfs://test/test_group2.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    )
    assert fs.ls("test/test_group2.parquet") == [
        "test/test_group2.parquet/_common_metadata",
        "test/test_group2.parquet/_metadata",
        "test/test_group2.parquet/part.0.parquet",
        "test/test_group2.parquet/part.1.parquet",
        "test/test_group2.parquet/part.2.parquet",
        "test/test_group2.parquet/part.3.parquet",
    ]
    df2_test = dd.read_parquet(
        "abfs://test/test_group2.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df2, df2_test)

    a = np.full(shape=(10000, 1), fill_value=1)
    b = np.full(shape=(10000, 1), fill_value=2)
    c = np.full(shape=(10000, 1), fill_value=3)
    d = np.full(shape=(10000, 1), fill_value=4)
    B = np.concatenate((a, b, c, d), axis=1)
    df3 = pd.DataFrame(data=B, columns=list("ABCD"))
    ddf3 = dd.from_pandas(df3, npartitions=4)
    dd.to_parquet(
        ddf3,
        "abfs://test/test_group3.parquet",
        partition_on=["A", "B"],
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    )
    assert fs.glob("test/test_group3.parquet/*") == [
        "test/test_group3.parquet/A=1",
        "test/test_group3.parquet/_common_metadata",
        "test/test_group3.parquet/_metadata",
    ]
    df3_test = dd.read_parquet(
        "abfs://test/test_group3.parquet",
        filters=[("A", "=", 1)],
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    df3_test = df3_test[["A", "B", "C", "D"]]
    df3_test = df3_test[["A", "B", "C", "D"]].astype(int)
    assert_frame_equal(df3, df3_test)

    A = np.random.randint(0, 100, size=(10000, 4))
    df4 = pd.DataFrame(data=A, columns=list("ABCD"))
    ddf4 = dd.from_pandas(df4, npartitions=4)
    dd.to_parquet(
        ddf4,
        "abfs://test/test_group4.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
        flavor="spark",
        write_statistics=False,
    )
    fs.rmdir("test/test_group4.parquet/_common_metadata", recursive=True)
    fs.rmdir("test/test_group4.parquet/_metadata", recursive=True)
    fs.rm("test/test_group4.parquet/_common_metadata")
    fs.rm("test/test_group4.parquet/_metadata")
    assert fs.ls("test/test_group4.parquet") == [
        "test/test_group4.parquet/part.0.parquet",
        "test/test_group4.parquet/part.1.parquet",
        "test/test_group4.parquet/part.2.parquet",
        "test/test_group4.parquet/part.3.parquet",
    ]
    df4_test = dd.read_parquet(
        "abfs://test/test_group4.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df4, df4_test)

    A = np.random.randint(0, 100, size=(10000, 4))
    df5 = pd.DataFrame(data=A, columns=list("ABCD"))
    ddf5 = dd.from_pandas(df5, npartitions=4)
    dd.to_parquet(
        ddf5,
        "abfs://test/test group5.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    )
    assert fs.ls("test/test group5.parquet") == [
        "test/test group5.parquet/_common_metadata",
        "test/test group5.parquet/_metadata",
        "test/test group5.parquet/part.0.parquet",
        "test/test group5.parquet/part.1.parquet",
        "test/test group5.parquet/part.2.parquet",
        "test/test group5.parquet/part.3.parquet",
    ]
    df5_test = dd.read_parquet(
        "abfs://test/test group5.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df5, df5_test)
Exemple #16
0
def test_large_blob(storage):
    import tempfile
    import hashlib
    import io
    import shutil
    from pathlib import Path

    fs = AzureBlobFileSystem(account_name=storage.account_name,
                             connection_string=CONN_STR)

    # create a 20MB byte array, ensure it's larger than blocksizes to force a
    # chuncked upload
    blob_size = 120_000_000
    # blob_size = 2_684_354_560
    assert blob_size > fs.blocksize
    assert blob_size > AzureBlobFile.DEFAULT_BLOCK_SIZE

    data = b"1" * blob_size
    _hash = hashlib.md5(data)
    expected = _hash.hexdigest()

    # create container
    fs.mkdir("chunk-container")

    # upload the data using fs.open
    path = "chunk-container/large-blob.bin"
    with fs.open(path, "ab") as dst:
        dst.write(data)

    assert fs.exists(path)
    assert fs.size(path) == blob_size

    del data

    # download with fs.open
    bio = io.BytesIO()
    with fs.open(path, "rb") as src:
        shutil.copyfileobj(src, bio)

    # read back the data and calculate md5
    bio.seek(0)
    data = bio.read()
    _hash = hashlib.md5(data)
    result = _hash.hexdigest()

    assert expected == result

    # do the same but using upload/download and a tempdir
    path = path = "chunk-container/large_blob2.bin"
    with tempfile.TemporaryDirectory() as td:
        local_blob: Path = Path(td) / "large_blob2.bin"
        with local_blob.open("wb") as fo:
            fo.write(data)
        assert local_blob.exists()
        assert local_blob.stat().st_size == blob_size

        fs.upload(str(local_blob), path)
        assert fs.exists(path)
        assert fs.size(path) == blob_size

        # download now
        local_blob.unlink()
        fs.download(path, str(local_blob))
        assert local_blob.exists()
        assert local_blob.stat().st_size == blob_size