Example #1
def test_mkdir_rm_recursive(storage):
    fs = AzureBlobFileSystem(account_name=storage.account_name,
                             connection_string=CONN_STR)

    fs.mkdir("test_mkdir_rm_recursive")
    assert "test_mkdir_rm_recursive/" in fs.ls("")

    with fs.open("test_mkdir_rm_recursive/file.txt", "wb") as f:
        f.write(b"0123456789")

    with fs.open("test_mkdir_rm_recursive/dir/file.txt", "wb") as f:
        f.write(b"ABCD")

    with fs.open("test_mkdir_rm_recursive/dir/file2.txt", "wb") as f:
        f.write(b"abcdef")

    assert fs.find("test_mkdir_rm_recursive") == [
        "test_mkdir_rm_recursive/dir/file.txt",
        "test_mkdir_rm_recursive/dir/file2.txt",
        "test_mkdir_rm_recursive/file.txt",
    ]

    fs.rm("test_mkdir_rm_recursive", recursive=True)

    assert "test_mkdir_rm_recursive/" not in fs.ls("")
    assert fs.find("test_mkdir_rm_recursive") == []
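These tests assume `CONN_STR` points at a local storage emulator. A minimal sketch of what it might look like, using the publicly documented Azurite/`devstoreaccount1` defaults (a well-known development key, not a secret); adjust the endpoint if your emulator runs elsewhere:

```python
# Assumed emulator connection string (Azurite defaults); this is an
# assumption, not taken from the tests themselves.
CONN_STR = (
    "DefaultEndpointsProtocol=http;"
    "AccountName=devstoreaccount1;"
    "AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;"
    "BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;"
)
```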
Example #2
def test_mkdir(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR,
    )

    # Verify mkdir will create a new container when create_parents is True
    fs.mkdir("new-container", create_parents=True)
    assert "new-container" in fs.ls(".")
    fs.rm("new-container")

    # Verify a new container will not be created when create_parents
    # is False
    with pytest.raises(PermissionError):
        fs.mkdir("new-container", create_parents=False)

    # Test creating a subdirectory when the container does not exist.
    # With create_parents=True, mkdir creates the top-level container,
    # but will NOT create nested directories
    fs.mkdir("new-container/dir", create_parents=True)
    assert "new-container/dir" not in fs.ls("new-container")
    assert "new-container" in fs.ls(".")
    fs.rm("new-container", recursive=True)

    # Test that creating a directory when already exists passes
    fs.mkdir("data")
    assert "data" in fs.ls(".")

    # Test raising error when container does not exist
    with pytest.raises(PermissionError):
        fs.mkdir("new-container/dir", create_parents=False)
Example #3
def test_rm(storage):
    fs = AzureBlobFileSystem(account_name=storage.account_name,
                             connection_string=CONN_STR)
    fs.rm("/data/root/a/file.txt")

    with pytest.raises(FileNotFoundError):
        fs.ls("/data/root/a/file.txt", refresh=True)
Example #4
def test_rm_recursive(storage):
    fs = AzureBlobFileSystem(account_name=storage.account_name,
                             connection_string=CONN_STR)

    assert "data/root/c/" in fs.ls("/data/root")

    assert fs.ls("data/root/c") == [
        "data/root/c/file1.txt",
        "data/root/c/file2.txt",
    ]
    fs.rm("data/root/c", recursive=True)
    assert "data/root/c/" not in fs.ls("/data/root")

    with pytest.raises(FileNotFoundError):
        fs.ls("data/root/c")
Example #5
import dask.dataframe as dd
import dask_mpi
import mlflow
import mlflow.xgboost
import xgboost as xgb
from adlfs import AzureBlobFileSystem
from distributed import Client


def main(args):
    # distributed setup
    print("initializing...")
    dask_mpi.initialize(nthreads=args.cpus_per_node)
    client = Client()
    print(client)

    # get data
    print("connecting to data...")
    print(client)
    container_name = "malware"
    storage_options = {"account_name": "azuremlexamples"}
    fs = AzureBlobFileSystem(**storage_options)
    files = fs.ls(f"{container_name}/processed")

    # read into dataframes
    print("creating dataframes...")
    print(client)
    for f in files:
        if "train" in f:
            df_train = dd.read_parquet(f"az://{f}", storage_options=storage_options)
        elif "test" in f:
            df_test = dd.read_parquet(f"az://{f}", storage_options=storage_options)

    # data processing
    print("processing data...")
    print(client)
    cols = [col for col in df_train.columns if df_train.dtypes[col] != "object"]
    X = df_train[cols].drop("HasDetections", axis=1).values.persist()
    y = df_train["HasDetections"].persist()

    # train xgboost
    print("training xgboost...")
    print(client)

    params = {
        "objective": "binary:logistic",
        "learning_rate": args.learning_rate,
        "gamma": args.gamma,
        "max_depth": args.max_depth,
    }
    mlflow.log_params(params)  # log to the run

    dtrain = xgb.dask.DaskDMatrix(client, X, y)
    model = xgb.dask.train(client, params, dtrain, num_boost_round=args.num_boost_round)
    print(model)

    # predict on test data
    print("making predictions...")
    print(client)
    X_test = df_test[
        [col for col in cols if "HasDetections" not in col]
    ].values.persist()
    y_pred = xgb.dask.predict(client, model, X_test)
    y_pred.to_dask_dataframe().to_csv("./outputs/predictions.csv")

    # save model
    print("saving model...")
    print(client)
    mlflow.xgboost.log_model(model["booster"], "./outputs/model")
Example #6
    def setup(self, stage=None):
        data_dir = "datasets/mnist"
        storage_options = {"account_name": "azuremlexamples"}
        fs = AzureBlobFileSystem(**storage_options)
        files = fs.ls(data_dir)

        train_len = 60000
        test_len = 10000

        for f in files:
            if "train-images" in f:
                self.X_train = self._read_images(gzip.open(fs.open(f)), train_len)
            elif "train-labels" in f:
                self.y_train = self._read_labels(gzip.open(fs.open(f)), train_len)
            elif "images" in f:
                self.X_test = self._read_images(gzip.open(fs.open(f)), test_len)
            elif "labels" in f:
                self.y_test = self._read_labels(gzip.open(fs.open(f)), test_len)

        self.ohe = OneHotEncoder().fit(self.y_train.reshape(-1, 1))

        self.mnist_train = list(
            zip(
                self.X_train, self.ohe.transform(self.y_train.reshape(-1, 1)).toarray(),
            )
        )
        self.mnist_test = list(
            zip(self.X_test, self.ohe.transform(self.y_test.reshape(-1, 1)).toarray(),)
        )
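The setup above calls `_read_images` and `_read_labels` helpers that are not shown. A minimal sketch of what they plausibly look like, assuming the standard IDX layout of the MNIST files (16-byte header for images, 8-byte header for labels); the signatures follow the calls above, the bodies are an assumption:

```python
import numpy as np

def _read_images(self, f, count):
    # Assumed IDX parsing: skip the 16-byte header (magic, count, rows, cols),
    # then read `count` 28x28 grayscale images.
    f.read(16)
    buf = f.read(28 * 28 * count)
    return np.frombuffer(buf, dtype=np.uint8).reshape(count, 28, 28)

def _read_labels(self, f, count):
    # Assumed IDX parsing: skip the 8-byte header (magic, count),
    # then read `count` single-byte labels.
    f.read(8)
    return np.frombuffer(f.read(count), dtype=np.uint8)
```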
Example #7
def test_makedir(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    # Verify makedir raises an error when the container already exists
    # and exist_ok is False
    with pytest.raises(FileExistsError):
        fs.makedir("data", exist_ok=False)

    # The container and directory already exist, so this should pass
    fs.makedir("data", exist_ok=True)
    assert "data/" in fs.ls(".")

    # Test creating subdirectory when container does not exist
    fs.makedir("new-container/dir")
    assert "new-container/dir" in fs.ls("new-container")
    fs.rm("new-container", recursive=True)
Example #8
def test_mkdir_rmdir(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    fs.mkdir("new-container")
    assert "new-container/" in fs.ls("")
    assert fs.ls("new-container") == []

    with fs.open(path="new-container/file.txt", mode="wb") as f:
        f.write(b"0123456789")

    with fs.open("new-container/dir/file.txt", "wb") as f:
        f.write(b"0123456789")

    with fs.open("new-container/dir/file2.txt", "wb") as f:
        f.write(b"0123456789")

    # Verify you can skip making a directory if the container
    # already exists, but still create a file in that directory
    fs.mkdir("new-container/dir/file.txt", exist_ok=False)
    assert "new-container/" in fs.ls("")

    fs.mkdir("new-container/file2.txt", exist_ok=False)
    assert "new-container/file2.txt" in fs.ls("new-container")

    # Verify that the newly created file is empty
    with fs.open("new-container/file2.txt", "rb") as f:
        outfile = f.read()
    assert outfile == b""

    # Verify that mkdir on an existing nested file path succeeds and leaves the file in place
    fs.mkdir("new-container/dir/file2.txt", exist_ok=False)
    assert "new-container/dir/file2.txt" in fs.ls("new-container/dir")

    # Also verify you can make a nested directory structure
    fs.mkdir("new-container/dir2/file.txt", exist_ok=False)
    with fs.open("new-container/dir2/file.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/dir2/file.txt" in fs.ls("new-container/dir2")
    fs.rm("new-container/dir2", recursive=True)

    fs.rm("new-container/dir", recursive=True)
    assert fs.ls("new-container") == [
        "new-container/file.txt",
        "new-container/file2.txt",
    ]

    fs.rm("new-container/file.txt")
    fs.rm("new-container/file2.txt")
    fs.rmdir("new-container")

    assert "new-container/" not in fs.ls("")
Example #9
def test_mkdir_rmdir(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    fs.mkdir("new-container")
    assert "new-container/" in fs.ls("")
    assert fs.ls("new-container") == []

    with fs.open(path="new-container/file.txt", mode="wb") as f:
        f.write(b"0123456789")

    with fs.open("new-container/dir/file.txt", "wb") as f:
        f.write(b"0123456789")

    with fs.open("new-container/dir/file2.txt", "wb") as f:
        f.write(b"0123456789")

    # Verify you can skip making a directory if the container
    # already exists, but still create a file in that directory
    fs.mkdir("new-container/dir/file.txt", exists_ok=True)
    assert "new-container/" in fs.ls("")

    fs.mkdir("new-container/file2.txt", exists_ok=True)
    with fs.open("new-container/file2.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/file2.txt" in fs.ls("new-container")

    fs.mkdir("new-container/dir/file2.txt", exists_ok=True)
    with fs.open("new-container/dir/file2.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/dir/file2.txt" in fs.ls("new-container/dir")

    # Also verify you can make a nested directory structure
    fs.mkdir("new-container/dir2/file.txt", exists_ok=True)
    with fs.open("new-container/dir2/file.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/dir2/file.txt" in fs.ls("new-container/dir2")
    fs.rm("new-container/dir2", recursive=True)

    fs.rm("new-container/dir", recursive=True)
    assert fs.ls("new-container") == [
        "new-container/file.txt",
        "new-container/file2.txt",
    ]

    fs.rm("new-container/file.txt")
    fs.rm("new-container/file2.txt")
    fs.rmdir("new-container")

    assert "new-container/" not in fs.ls("")
Example #10
def test_cp_file(storage):
    fs = AzureBlobFileSystem(account_name=storage.account_name,
                             connection_string=CONN_STR)
    fs.mkdir("homedir")
    fs.mkdir("homedir/enddir")
    fs.touch("homedir/startdir/test_file.txt")
    fs.cp_file("homedir/startdir/test_file.txt",
               "homedir/enddir/test_file.txt")
    files = fs.ls("homedir/enddir")
    assert "homedir/enddir/test_file.txt" in files

    fs.rm("homedir", recursive=True)
Example #11
def test_open_file(storage, mocker):
    fs = AzureBlobFileSystem(account_name=storage.account_name,
                             connection_string=CONN_STR)

    f = fs.open("/data/root/a/file.txt")

    result = f.read()
    assert result == b"0123456789"

    close = mocker.patch.object(f.container_client, "close")
    f.close()
    print(fs.ls("/data/root/a"))

    close.assert_called_once()
Example #12
def test_mkdir(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    # Verify mkdir will create a new container when create_parents is True
    fs.mkdir("new-container", create_parents=True)
    assert "new-container/" in fs.ls(".")
    fs.rm("new-container")

    # Verify a new container will not be created when create_parents
    # is False
    with pytest.raises(PermissionError):
        fs.mkdir("new-container", create_parents=False)

    # Test creating subdirectory when container does not exist
    fs.mkdir("new-container/dir", create_parents=True)
    assert "new-container/dir" in fs.ls("new-container")
    fs.rm("new-container", recursive=True)

    # Test raising error when container does not exist
    with pytest.raises(PermissionError):
        fs.mkdir("new-container/dir", create_parents=False)
Example #13
    def fs(self):
        from adlfs import AzureBlobFileSystem
        from azure.core.exceptions import AzureError

        try:
            file_system = AzureBlobFileSystem(**self.login_info)
            if self.bucket not in [
                    container.rstrip("/") for container in file_system.ls("/")
            ]:
                file_system.mkdir(self.bucket)
        except (ValueError, AzureError) as e:
            raise AzureAuthError(
                f"Authentication to Azure Blob Storage via {self.login_method}"
                " failed.\nLearn more about configuration settings at"
                f" {format_link('https://man.dvc.org/remote/modify')}") from e

        return file_system
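For reference, a sketch of the kind of `login_info` dict the property above might receive. The keys mirror `AzureBlobFileSystem` constructor arguments seen elsewhere on this page; the values are placeholders:

```python
# Hypothetical login_info; any one auth mechanism is sufficient.
login_info = {
    "account_name": "myaccount",   # placeholder
    "connection_string": None,     # or a full connection string
    "account_key": None,           # or a shared key
    "sas_token": None,             # or a SAS token
}
```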
Example #14
def test_makedir_rmdir(storage, caplog):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    fs.makedir("new-container")
    assert "new-container" in fs.ls("")
    assert fs.ls("new-container") == []

    with fs.open(path="new-container/file.txt", mode="wb") as f:
        f.write(b"0123456789")

    with fs.open("new-container/dir/file.txt", "wb") as f:
        f.write(b"0123456789")

    with fs.open("new-container/dir/file2.txt", "wb") as f:
        f.write(b"0123456789")

    # Verify that mkdir will raise an exception if the directory exists
    # and exist_ok is False
    with pytest.raises(FileExistsError):
        fs.makedir("new-container/dir/file.txt", exist_ok=False)

    # mkdir should raise an error if the container exists and
    # we try to create a nested directory, with exist_ok=False
    with pytest.raises(FileExistsError):
        fs.makedir("new-container/dir2", exist_ok=False)

    # Check that makedir on an existing nested file path is a no-op
    # when exist_ok is True
    fs.makedir("new-container/dir/file2.txt", exist_ok=True)
    assert "new-container/dir/file2.txt" in fs.ls("new-container/dir")

    # Also verify you can make a nested directory structure
    with fs.open("new-container/dir2/file.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/dir2/file.txt" in fs.ls("new-container/dir2")
    fs.rm("new-container/dir2", recursive=True)

    fs.rm("new-container/dir", recursive=True)
    fs.touch("new-container/file2.txt")
    assert fs.ls("new-container") == [
        "new-container/file.txt",
        "new-container/file2.txt",
    ]

    fs.rm("new-container/file.txt")
    fs.rm("new-container/file2.txt")
    fs.rmdir("new-container")

    assert "new-container" not in fs.ls("")
Example #15
def test_dask_parquet(storage):
    fs = AzureBlobFileSystem(account_name=storage.account_name,
                             connection_string=CONN_STR)
    fs.mkdir("test")
    STORAGE_OPTIONS = {
        "account_name": "devstoreaccount1",
        "connection_string": CONN_STR,
    }
    df = pd.DataFrame({
        "col1": [1, 2, 3, 4],
        "col2": [2, 4, 6, 8],
        "index_key": [1, 1, 2, 2],
        "partition_key": [1, 1, 2, 2],
    })

    dask_dataframe = dd.from_pandas(df, npartitions=1)
    for protocol in ["abfs", "az"]:
        dask_dataframe.to_parquet(
            "{}://[email protected]/test_group.parquet".format(
                protocol),
            storage_options=STORAGE_OPTIONS,
            engine="pyarrow",
        )

        fs = AzureBlobFileSystem(**STORAGE_OPTIONS)
        assert fs.ls("test/test_group.parquet") == [
            "test/test_group.parquet/_common_metadata",
            "test/test_group.parquet/_metadata",
            "test/test_group.parquet/part.0.parquet",
        ]
        fs.rm("test/test_group.parquet")

    df_test = dd.read_parquet(
        "abfs://test/test_group.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df, df_test)
Example #16
def test_deep_paths(storage):
    fs = AzureBlobFileSystem(account_name=storage.account_name,
                             connection_string=CONN_STR)

    fs.mkdir("test_deep")
    assert "test_deep/" in fs.ls("")

    with fs.open("test_deep/a/b/c/file.txt", "wb") as f:
        f.write(b"0123456789")

    assert fs.ls("test_deep") == ["test_deep/a/"]
    assert fs.ls("test_deep/") == ["test_deep/a/"]
    assert fs.ls("test_deep/a") == ["test_deep/a/b/"]
    assert fs.ls("test_deep/a/") == ["test_deep/a/b/"]
    assert fs.find("test_deep") == ["test_deep/a/b/c/file.txt"]
    assert fs.find("test_deep/") == ["test_deep/a/b/c/file.txt"]
    assert fs.find("test_deep/a") == ["test_deep/a/b/c/file.txt"]
    assert fs.find("test_deep/a/") == ["test_deep/a/b/c/file.txt"]

    fs.rm("test_deep", recursive=True)

    assert "test_deep/" not in fs.ls("")
    assert fs.find("test_deep") == []
Example #17
def test_ls(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR,
    )

    ## these are containers
    assert fs.ls("") == ["data"]
    assert fs.ls("/") == ["data"]
    assert fs.ls(".") == ["data"]
    assert fs.ls("*") == ["data"]

    ## these are top-level directories and files
    assert fs.ls("data") == ["data/root", "data/top_file.txt"]
    assert fs.ls("/data") == ["data/root", "data/top_file.txt"]

    # root contains files and directories
    assert fs.ls("data/root") == [
        "data/root/a",
        "data/root/a1",
        "data/root/b",
        "data/root/c",
        "data/root/d",
        "data/root/rfile.txt",
    ]
    assert fs.ls("data/root/") == [
        "data/root/a",
        "data/root/a1",
        "data/root/b",
        "data/root/c",
        "data/root/d",
        "data/root/rfile.txt",
    ]

    ## slashes are not needed, but accepted
    assert fs.ls("data/root/a") == ["data/root/a/file.txt"]
    assert fs.ls("data/root/a/") == ["data/root/a/file.txt"]
    assert fs.ls("/data/root/a") == ["data/root/a/file.txt"]
    assert fs.ls("/data/root/a/") == ["data/root/a/file.txt"]
    assert fs.ls("data/root/b") == ["data/root/b/file.txt"]
    assert fs.ls("data/root/b/") == ["data/root/b/file.txt"]
    assert fs.ls("data/root/a1") == ["data/root/a1/file1.txt"]
    assert fs.ls("data/root/a1/") == ["data/root/a1/file1.txt"]

    ## file details
    files = fs.ls("data/root/a/file.txt", detail=True)
    assert_blobs_equals(
        files,
        [
            {
                "name": "data/root/a/file.txt",
                "size": 10,
                "type": "file",
                "archive_status": None,
                "deleted": None,
                "creation_time": storage.insert_time,
                "last_modified": storage.insert_time,
                "deleted_time": None,
                "last_accessed_on": None,
                "remaining_retention_days": None,
                "tag_count": None,
                "tags": None,
                "metadata": {},
                "content_settings": {
                    "content_type": "application/octet-stream",
                    "content_encoding": None,
                    "content_language": None,
                    "content_md5": bytearray(
                        b"x\x1e^$]i\xb5f\x97\x9b\x86\xe2\x8d#\xf2\xc7"
                    ),
                    "content_disposition": None,
                    "cache_control": None,
                },
            }
        ],
    )

    # c has two files
    assert_blobs_equals(
        fs.ls("data/root/c", detail=True),
        [
            {
                "name": "data/root/c/file1.txt",
                "size": 10,
                "type": "file",
                "archive_status": None,
                "deleted": None,
                "creation_time": storage.insert_time,
                "last_modified": storage.insert_time,
                "deleted_time": None,
                "last_accessed_on": None,
                "remaining_retention_days": None,
                "tag_count": None,
                "tags": None,
                "metadata": {},
                "content_settings": {
                    "content_type": "application/octet-stream",
                    "content_encoding": None,
                    "content_language": None,
                    "content_md5": bytearray(
                        b"x\x1e^$]i\xb5f\x97\x9b\x86\xe2\x8d#\xf2\xc7"
                    ),
                    "content_disposition": None,
                    "cache_control": None,
                },
            },
            {
                "name": "data/root/c/file2.txt",
                "size": 10,
                "type": "file",
                "archive_status": None,
                "deleted": None,
                "creation_time": storage.insert_time,
                "last_modified": storage.insert_time,
                "deleted_time": None,
                "last_accessed_on": None,
                "remaining_retention_days": None,
                "tag_count": None,
                "tags": None,
                "metadata": {},
                "content_settings": {
                    "content_type": "application/octet-stream",
                    "content_encoding": None,
                    "content_language": None,
                    "content_md5": bytearray(
                        b"x\x1e^$]i\xb5f\x97\x9b\x86\xe2\x8d#\xf2\xc7"
                    ),
                    "content_disposition": None,
                    "cache_control": None,
                },
            },
        ],
    )

    # with metadata
    assert_blobs_equals(
        fs.ls("data/root/d", detail=True),
        [
            {
                "name": "data/root/d/file_with_metadata.txt",
                "size": 10,
                "type": "file",
                "archive_status": None,
                "deleted": None,
                "creation_time": storage.insert_time,
                "last_modified": storage.insert_time,
                "deleted_time": None,
                "last_accessed_on": None,
                "remaining_retention_days": None,
                "tag_count": None,
                "tags": None,
                "metadata": {"meta": "data"},
                "content_settings": {
                    "content_type": "application/octet-stream",
                    "content_encoding": None,
                    "content_language": None,
                    "content_md5": bytearray(
                        b"x\x1e^$]i\xb5f\x97\x9b\x86\xe2\x8d#\xf2\xc7"
                    ),
                    "content_disposition": None,
                    "cache_control": None,
                },
            }
        ],
    )

    ## if no direct match is found, an error is thrown
    with pytest.raises(FileNotFoundError):
        fs.ls("not-a-container")

    with pytest.raises(FileNotFoundError):
        fs.ls("data/not-a-directory/")

    with pytest.raises(FileNotFoundError):
        fs.ls("data/root/not-a-file.txt")
Example #18
def test_log_large_dask_dataframe_to_azure(auth_method):
    # Create the environmental variables
    verify_auth_parameters_and_configure_env(auth_method)

    A = np.random.random_sample(size=(25000000, 6))
    df = pd.DataFrame(data=A, columns=list("ABCDEF"))
    ddf = dd.from_pandas(df, npartitions=10).persist()

    size = ddf.memory_usage().sum().compute()
    print(f"demo data has size:  {size // 1e6} MB")
    # Verify that the size of the dataframe is > 1GB, and so
    # will write a collection of files, instead of a single
    # file
    assert (size // 1e6) > 1100

    # Create an MLRun context
    context = mlrun.get_or_create_ctx("test")

    # Define the artifact location
    target_path = "az://" + config["env"].get("AZURE_CONTAINER") + "/"

    context.log_dataset(
        key="demo_data",
        df=ddf,
        format="parquet",
        artifact_path=target_path,
        stats=True,
    )

    data_item2 = mlrun.get_dataitem(f"{target_path}demo_data.parquet")
    ddf2 = data_item2.as_df(df_module=dd)

    # Check that a collection of files is written to Azure,
    # rather than a single parquet file
    from adlfs import AzureBlobFileSystem

    fs = AzureBlobFileSystem(
        account_name=os.getenv("AZURE_STORAGE_ACCOUNT_NAME"),
        account_key=os.getenv("AZURE_STORAGE_ACCOUNT_KEY"),
        connection_string=os.getenv("AZURE_STORAGE_CONNECTION_STRING"),
        tenant_id=os.getenv("AZURE_STORAGE_TENANT_ID"),
        client_id=os.getenv("AZURE_STORAGE_CLIENT_ID"),
        client_secret=os.getenv("AZURE_STORAGE_CLIENT_SECRET"),
        sas_token=os.getenv("AZURE_STORAGE_SAS_TOKEN"),
    )
    # Verify that a directory was created, rather than a file
    path = target_path.partition("//")[2]
    path = os.path.join(path, "demo_data.parquet")
    assert fs.isdir(path) is True

    # Verify that a collection of files was written
    files = fs.ls(path)
    assert len(files) > 4

    df2 = ddf2.compute()
    df2 = df2.reset_index(drop=True)
    df = ddf.compute()
    df = df.reset_index(drop=True)
    # Verify that the returned dataframe matches the original
    pd.testing.assert_frame_equal(df,
                                  df2,
                                  check_index_type=False,
                                  check_less_precise=True)
Example #19
def test_dask_parquet(storage):
    fs = AzureBlobFileSystem(account_name=storage.account_name,
                             connection_string=CONN_STR)
    fs.mkdir("test")
    STORAGE_OPTIONS = {
        "account_name": "devstoreaccount1",
        "connection_string": CONN_STR,
    }
    df = pd.DataFrame({
        "col1": [1, 2, 3, 4],
        "col2": [2, 4, 6, 8],
        "index_key": [1, 1, 2, 2],
        "partition_key": [1, 1, 2, 2],
    })

    dask_dataframe = dd.from_pandas(df, npartitions=1)
    for protocol in ["abfs", "az"]:
        dask_dataframe.to_parquet(
            "{}://test/test_group.parquet".format(protocol),
            storage_options=STORAGE_OPTIONS,
            engine="pyarrow",
        )

        fs = AzureBlobFileSystem(**STORAGE_OPTIONS)
        assert fs.ls("test/test_group.parquet") == [
            "test/test_group.parquet/_common_metadata",
            "test/test_group.parquet/_metadata",
            "test/test_group.parquet/part.0.parquet",
        ]
        fs.rm("test/test_group.parquet")

    df_test = dd.read_parquet(
        "abfs://test/test_group.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df, df_test)

    A = np.random.randint(0, 100, size=(10000, 4))
    df2 = pd.DataFrame(data=A, columns=list("ABCD"))
    ddf2 = dd.from_pandas(df2, npartitions=4)
    dd.to_parquet(
        ddf2,
        "abfs://test/test_group2.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    )
    assert fs.ls("test/test_group2.parquet") == [
        "test/test_group2.parquet/_common_metadata",
        "test/test_group2.parquet/_metadata",
        "test/test_group2.parquet/part.0.parquet",
        "test/test_group2.parquet/part.1.parquet",
        "test/test_group2.parquet/part.2.parquet",
        "test/test_group2.parquet/part.3.parquet",
    ]
    df2_test = dd.read_parquet(
        "abfs://test/test_group2.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df2, df2_test)

    a = np.full(shape=(10000, 1), fill_value=1)
    b = np.full(shape=(10000, 1), fill_value=2)
    c = np.full(shape=(10000, 1), fill_value=3)
    d = np.full(shape=(10000, 1), fill_value=4)
    B = np.concatenate((a, b, c, d), axis=1)
    df3 = pd.DataFrame(data=B, columns=list("ABCD"))
    ddf3 = dd.from_pandas(df3, npartitions=4)
    dd.to_parquet(
        ddf3,
        "abfs://test/test_group3.parquet",
        partition_on=["A", "B"],
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    )
    assert fs.glob("test/test_group3.parquet/*") == [
        "test/test_group3.parquet/A=1",
        "test/test_group3.parquet/_common_metadata",
        "test/test_group3.parquet/_metadata",
    ]
    df3_test = dd.read_parquet(
        "abfs://test/test_group3.parquet",
        filters=[("A", "=", 1)],
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    df3_test = df3_test[["A", "B", "C", "D"]]
    df3_test = df3_test[["A", "B", "C", "D"]].astype(int)
    assert_frame_equal(df3, df3_test)

    A = np.random.randint(0, 100, size=(10000, 4))
    df4 = pd.DataFrame(data=A, columns=list("ABCD"))
    ddf4 = dd.from_pandas(df4, npartitions=4)
    dd.to_parquet(
        ddf4,
        "abfs://test/test_group4.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
        flavor="spark",
        write_statistics=False,
    )
    fs.rmdir("test/test_group4.parquet/_common_metadata", recursive=True)
    fs.rmdir("test/test_group4.parquet/_metadata", recursive=True)
    fs.rm("test/test_group4.parquet/_common_metadata")
    fs.rm("test/test_group4.parquet/_metadata")
    assert fs.ls("test/test_group4.parquet") == [
        "test/test_group4.parquet/part.0.parquet",
        "test/test_group4.parquet/part.1.parquet",
        "test/test_group4.parquet/part.2.parquet",
        "test/test_group4.parquet/part.3.parquet",
    ]
    df4_test = dd.read_parquet(
        "abfs://test/test_group4.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df4, df4_test)

    A = np.random.randint(0, 100, size=(10000, 4))
    df5 = pd.DataFrame(data=A, columns=list("ABCD"))
    ddf5 = dd.from_pandas(df5, npartitions=4)
    dd.to_parquet(
        ddf5,
        "abfs://test/test group5.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    )
    assert fs.ls("test/test group5.parquet") == [
        "test/test group5.parquet/_common_metadata",
        "test/test group5.parquet/_metadata",
        "test/test group5.parquet/part.0.parquet",
        "test/test group5.parquet/part.1.parquet",
        "test/test group5.parquet/part.2.parquet",
        "test/test group5.parquet/part.3.parquet",
    ]
    df5_test = dd.read_parquet(
        "abfs://test/test group5.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df5, df5_test)
Example #20
def test_ls(storage):
    fs = AzureBlobFileSystem(account_name=storage.account_name,
                             connection_string=CONN_STR)

    ## these are containers
    assert fs.ls("") == ["data/"]
    assert fs.ls("/") == ["data/"]
    assert fs.ls(".") == ["data/"]

    ## these are top-level directories and files
    assert fs.ls("data") == ["data/root/", "data/top_file.txt"]
    assert fs.ls("/data") == ["data/root/", "data/top_file.txt"]

    # root contains files and directories
    assert fs.ls("data/root") == [
        "data/root/a/",
        "data/root/b/",
        "data/root/c/",
        "data/root/rfile.txt",
    ]
    assert fs.ls("data/root/") == [
        "data/root/a/",
        "data/root/b/",
        "data/root/c/",
        "data/root/rfile.txt",
    ]

    ## slashes are not needed, but accepted
    assert fs.ls("data/root/a") == ["data/root/a/file.txt"]
    assert fs.ls("data/root/a/") == ["data/root/a/file.txt"]
    assert fs.ls("/data/root/a") == ["data/root/a/file.txt"]
    assert fs.ls("/data/root/a/") == ["data/root/a/file.txt"]

    ## file details
    assert fs.ls("data/root/a/file.txt", detail=True) == [{
        "name": "data/root/a/file.txt",
        "size": 10,
        "type": "file"
    }]

    # c has two files
    assert fs.ls("data/root/c", detail=True) == [
        {
            "name": "data/root/c/file1.txt",
            "size": 10,
            "type": "file"
        },
        {
            "name": "data/root/c/file2.txt",
            "size": 10,
            "type": "file"
        },
    ]

    ## if no direct match is found, an error is thrown
    with pytest.raises(FileNotFoundError):
        fs.ls("not-a-container")

    with pytest.raises(FileNotFoundError):
        fs.ls("data/not-a-directory/")

    with pytest.raises(FileNotFoundError):
        fs.ls("data/root/not-a-file.txt")
Example #21
class AzureDataLake(Source):
    """
    A class for pulling data from the Azure Data Lakes (gen1 and gen2).
    You can either connect to the lake in general or to a particular path,
    e.g.
    lake = AzureDataLake(); lake.exists("a/b/c.csv")
    vs
    lake = AzureDataLake(path="a/b/c.csv"); lake.exists()

    Parameters
    ----------
    credentials : Dict[str, Any], optional
        A dictionary containing ACCOUNT_NAME and the following
        Service Principal credentials:
            - AZURE_TENANT_ID
            - AZURE_CLIENT_ID
            - AZURE_CLIENT_SECRET
    """

    def __init__(
        self,
        path: str = None,
        gen: int = 2,
        credentials: Dict[str, Any] = None,
        *args,
        **kwargs,
    ):

        credentials = credentials or local_config.get("AZURE_ADLS")

        super().__init__(*args, credentials=credentials, **kwargs)

        storage_account_name = self.credentials["ACCOUNT_NAME"]
        tenant_id = self.credentials["AZURE_TENANT_ID"]
        client_id = self.credentials["AZURE_CLIENT_ID"]
        client_secret = self.credentials["AZURE_CLIENT_SECRET"]

        self.path = path
        self.gen = gen
        self.storage_options = {
            "tenant_id": tenant_id,
            "client_id": client_id,
            "client_secret": client_secret,
        }
        if gen == 1:
            self.fs = AzureDatalakeFileSystem(
                store_name=storage_account_name,
                tenant_id=tenant_id,
                client_id=client_id,
                client_secret=client_secret,
            )
            self.base_url = f"adl://{storage_account_name}"
        elif gen == 2:
            self.storage_options["account_name"] = storage_account_name
            self.fs = AzureBlobFileSystem(
                account_name=storage_account_name,
                tenant_id=tenant_id,
                client_id=client_id,
                client_secret=client_secret,
            )
            self.base_url = f"az://"

    def upload(
        self,
        from_path: str,
        to_path: str = None,
        recursive: bool = False,
        overwrite: bool = False,
    ) -> None:
        """
        Upload file(s) to the lake.

        Args:
            from_path (str): Path to the local file(s) to be uploaded.
            to_path (str): Path to the destination file/folder
            recursive (bool): Set this to true if working with directories.
            overwrite (bool): Whether to overwrite the file(s) if they exist.

        Example:
        ```python
        from viadot.sources import AzureDataLake
        lake = AzureDataLake()
        lake.upload(from_path='tests/test.csv', to_path="sandbox/test.csv")
        ```
        """

        if self.gen == 1:
            raise NotImplementedError(
                "Azure Data Lake Gen1 does not support simple file upload."
            )

        to_path = to_path or self.path
        self.fs.upload(
            lpath=from_path,
            rpath=to_path,
            recursive=recursive,
            overwrite=overwrite,
        )

    def exists(self, path: str = None) -> bool:
        """
        Check if a location exists in Azure Data Lake.

        Args:
            path (str): The path to check. Can be a file or a directory.

        Example:
        ```python
        from viadot.sources import AzureDataLake

        lake = AzureDataLake(gen=1)
        lake.exists("tests/test.csv")
        ```

        Returns:
            bool: Whether the path exists.
        """
        path = path or self.path
        return self.fs.exists(path)

    def download(
        self,
        to_path: str,
        from_path: str = None,
        recursive: bool = False,
        overwrite: bool = True,
    ) -> None:
        if overwrite is False:
            raise NotImplementedError(
                "Currently, only the default behavior (overwrite) is available."
            )

        from_path = from_path or self.path
        self.fs.download(rpath=from_path, lpath=to_path, recursive=recursive)

    def to_df(
        self,
        path: str = None,
        sep: str = "\t",
        quoting: int = 0,
        lineterminator: str = None,
        error_bad_lines: bool = None,
    ):
        if quoting is None:
            quoting = 0

        path = path or self.path
        url = os.path.join(self.base_url, path)

        if url.endswith(".csv"):
            df = pd.read_csv(
                url,
                storage_options=self.storage_options,
                sep=sep,
                quoting=quoting,
                lineterminator=lineterminator,
                error_bad_lines=error_bad_lines,
            )
        elif url.endswith(".parquet"):
            df = pd.read_parquet(url, storage_options=self.storage_options)
        else:
            raise ValueError("Only CSV and parquet formats are supported.")

        return df

    def ls(self, path: str = None) -> List[str]:
        path = path or self.path
        return self.fs.ls(path)

    def rm(self, path: str = None, recursive: bool = False):
        path = path or self.path
        self.fs.rm(path, recursive=recursive)

    def cp(self, from_path: str = None, to_path: str = None, recursive: bool = False):
        from_path = from_path or self.path
        self.fs.cp(from_path, to_path, recursive=recursive)
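A short usage sketch for the class above, assuming `local_config` supplies the `AZURE_ADLS` credentials dict; the paths are placeholders:

```python
from viadot.sources import AzureDataLake

lake = AzureDataLake(gen=2)  # credentials are read from local_config["AZURE_ADLS"]
lake.upload(from_path="tests/test.csv", to_path="sandbox/test.csv")
assert lake.exists("sandbox/test.csv")
df = lake.to_df(path="sandbox/test.csv", sep=",")
lake.rm("sandbox/test.csv")
```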
Example #22
def test_makedir_rmdir(storage, caplog):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    fs.makedir("new-container")
    assert "new-container/" in fs.ls("")
    assert fs.ls("new-container") == []

    with fs.open(path="new-container/file.txt", mode="wb") as f:
        f.write(b"0123456789")

    with fs.open("new-container/dir/file.txt", "wb") as f:
        f.write(b"0123456789")

    with fs.open("new-container/dir/file2.txt", "wb") as f:
        f.write(b"0123456789")

    # Verify that mkdir will raise an exception if the directory exists
    # and exist_ok is False
    with pytest.raises(FileExistsError):
        fs.makedir("new-container/dir/file.txt", exist_ok=False)

    # Verify that mkdir creates a directory if exist_ok is False and the
    # directory does not exist
    fs.makedir("new-container/file2.txt", exist_ok=False)
    assert "new-container/file2.txt" in fs.ls("new-container")

    # Verify that mkdir will silently ignore an existing directory if
    # the directory exists and exist_ok is True
    fs.makedir("new-container/dir", exist_ok=True)
    assert "new-container/dir/" in fs.ls("new-container")

    # Verify that the newly created file is empty
    with fs.open("new-container/file2.txt", "rb") as f:
        outfile = f.read()
    assert outfile == b""

    # Check that makedir on an existing nested file path is a no-op
    # when exist_ok is True
    fs.makedir("new-container/dir/file2.txt", exist_ok=True)
    assert "new-container/dir/file2.txt" in fs.ls("new-container/dir")

    # Also verify you can make a nested directory structure
    fs.makedir("new-container/dir2/file.txt", exist_ok=False)
    with fs.open("new-container/dir2/file.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/dir2/file.txt" in fs.ls("new-container/dir2")
    fs.rm("new-container/dir2", recursive=True)

    fs.rm("new-container/dir", recursive=True)
    assert fs.ls("new-container") == [
        "new-container/file.txt",
        "new-container/file2.txt",
    ]

    fs.rm("new-container/file.txt")
    fs.rm("new-container/file2.txt")
    fs.rmdir("new-container")

    assert "new-container/" not in fs.ls("")
Example #23
import dask.dataframe as dd

from distributed import Client
from adlfs import AzureBlobFileSystem

# setup variables
container_name = "malware"
storage_options = {"account_name": "azuremlexamples"}

# create distributed client
c = Client()

# create Azure filesystem
fs = AzureBlobFileSystem(**storage_options)

# list files
files = fs.ls(f"{container_name}/processed")

# read in training data
for f in files:
    if "train" in f:
        df = dd.read_parquet(f"az://{f}", storage_options=storage_options)

# advanced feature engineering
cols = [col for col in df.columns if df.dtypes[col] != "object"]

# define system input and output
X = df[cols].drop("HasDetections", axis=1).values.persist()
y = df["HasDetections"].values.persist()

# print something
print(len(X))
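One plausible continuation of the snippet above, mirroring the distributed XGBoost training in Example #5; the parameters are illustrative:

```python
import xgboost as xgb

# Wrap the persisted dask collections and train on the distributed client.
dtrain = xgb.dask.DaskDMatrix(c, X, y)
params = {"objective": "binary:logistic", "max_depth": 8}
model = xgb.dask.train(c, params, dtrain, num_boost_round=10)
print(model["booster"])
```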