def test_mkdir_rm_recursive(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )

    fs.mkdir("test_mkdir_rm_recursive")
    assert "test_mkdir_rm_recursive/" in fs.ls("")

    with fs.open("test_mkdir_rm_recursive/file.txt", "wb") as f:
        f.write(b"0123456789")

    with fs.open("test_mkdir_rm_recursive/dir/file.txt", "wb") as f:
        f.write(b"ABCD")

    with fs.open("test_mkdir_rm_recursive/dir/file2.txt", "wb") as f:
        f.write(b"abcdef")

    assert fs.find("test_mkdir_rm_recursive") == [
        "test_mkdir_rm_recursive/dir/file.txt",
        "test_mkdir_rm_recursive/dir/file2.txt",
        "test_mkdir_rm_recursive/file.txt",
    ]

    fs.rm("test_mkdir_rm_recursive", recursive=True)

    assert "test_mkdir_rm_recursive/" not in fs.ls("")
    assert fs.find("test_mkdir_rm_recursive") == []

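# The tests in this section reference a `storage` pytest fixture and a CONN_STR
# constant that are not defined here. The sketch below is a minimal,
# illustrative assumption of what they could look like when run against a local
# Azurite storage emulator (devstoreaccount1 with its well-known development
# key); the real fixture also seeds the "data" container with the files and
# timestamps (e.g. storage.insert_time) the tests expect.
import pytest
from adlfs import AzureBlobFileSystem

CONN_STR = (
    "DefaultEndpointsProtocol=http;"
    "AccountName=devstoreaccount1;"
    "AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;"
    "BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;"
)


@pytest.fixture()
def storage():
    # Hypothetical stand-in object exposing the attributes these tests use.
    class _Storage:
        account_name = "devstoreaccount1"

    yield _Storage()
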
def test_mkdir(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    # Verify mkdir will create a new container when create_parents is True
    fs.mkdir("new-container", create_parents=True)
    assert "new-container" in fs.ls(".")
    fs.rm("new-container")

    # Verify a new container will not be created when create_parents
    # is False
    with pytest.raises(PermissionError):
        fs.mkdir("new-container", create_parents=False)

    # Test creating a subdirectory when the container does not exist.
    # mkdir on a nested path is effectively a no-op: with create_parents=True
    # it creates the top-level container, but will NOT create the nested
    # directory
    fs.mkdir("new-container/dir", create_parents=True)
    assert "new-container/dir" not in fs.ls("new-container")
    assert "new-container" in fs.ls(".")
    fs.rm("new-container", recursive=True)

    # Test that creating a directory that already exists passes
    fs.mkdir("data")
    assert "data" in fs.ls(".")

    # Test raising an error when the container does not exist
    with pytest.raises(PermissionError):
        fs.mkdir("new-container/dir", create_parents=False)

def test_rm(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    fs.rm("/data/root/a/file.txt")

    with pytest.raises(FileNotFoundError):
        fs.ls("/data/root/a/file.txt", refresh=True)

def test_rm_recursive(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )

    assert "data/root/c/" in fs.ls("/data/root")
    assert fs.ls("data/root/c") == [
        "data/root/c/file1.txt",
        "data/root/c/file2.txt",
    ]

    fs.rm("data/root/c", recursive=True)
    assert "data/root/c/" not in fs.ls("/data/root")

    with pytest.raises(FileNotFoundError):
        fs.ls("data/root/c")

def main(args):
    # distributed setup
    print("initializing...")
    dask_mpi.initialize(nthreads=args.cpus_per_node)
    client = Client()
    print(client)

    # get data
    print("connecting to data...")
    print(client)
    container_name = "malware"
    storage_options = {"account_name": "azuremlexamples"}
    fs = AzureBlobFileSystem(**storage_options)
    files = fs.ls(f"{container_name}/processed")

    # read into dataframes
    print("creating dataframes...")
    print(client)
    for f in files:
        if "train" in f:
            df_train = dd.read_parquet(f"az://{f}", storage_options=storage_options)
        elif "test" in f:
            df_test = dd.read_parquet(f"az://{f}", storage_options=storage_options)

    # data processing
    print("processing data...")
    print(client)
    cols = [col for col in df_train.columns if df_train.dtypes[col] != "object"]
    X = df_train[cols].drop("HasDetections", axis=1).values.persist()
    y = df_train["HasDetections"].persist()

    # train xgboost
    print("training xgboost...")
    print(client)
    params = {
        "objective": "binary:logistic",
        "learning_rate": args.learning_rate,
        "gamma": args.gamma,
        "max_depth": args.max_depth,
    }
    mlflow.log_params(params)  # log to the run

    dtrain = xgb.dask.DaskDMatrix(client, X, y)
    model = xgb.dask.train(
        client, params, dtrain, num_boost_round=args.num_boost_round
    )
    print(model)

    # predict on test data
    print("making predictions...")
    print(client)
    X_test = df_test[
        [col for col in cols if "HasDetections" not in col]
    ].values.persist()
    y_pred = xgb.dask.predict(client, model, X_test)
    y_pred.to_dask_dataframe().to_csv("./outputs/predictions.csv")

    # save model
    print("saving model...")
    print(client)
    mlflow.xgboost.log_model(model["booster"], "./outputs/model")

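# Hypothetical entrypoint for the script above. The argument names
# (cpus_per_node, learning_rate, gamma, max_depth, num_boost_round) are taken
# from how `args` is used in main(); the defaults are illustrative assumptions.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--cpus_per_node", type=int, default=4)
    parser.add_argument("--learning_rate", type=float, default=0.1)
    parser.add_argument("--gamma", type=float, default=0.0)
    parser.add_argument("--max_depth", type=int, default=8)
    parser.add_argument("--num_boost_round", type=int, default=100)
    args = parser.parse_args()

    main(args)
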
def setup(self, stage=None):
    data_dir = "datasets/mnist"
    storage_options = {"account_name": "azuremlexamples"}
    fs = AzureBlobFileSystem(**storage_options)
    files = fs.ls(data_dir)

    train_len = 60000
    test_len = 10000

    for f in files:
        if "train-images" in f:
            self.X_train = self._read_images(gzip.open(fs.open(f)), train_len)
        elif "train-labels" in f:
            self.y_train = self._read_labels(gzip.open(fs.open(f)), train_len)
        elif "images" in f:
            self.X_test = self._read_images(gzip.open(fs.open(f)), test_len)
        elif "labels" in f:
            self.y_test = self._read_labels(gzip.open(fs.open(f)), test_len)

    self.ohe = OneHotEncoder().fit(self.y_train.reshape(-1, 1))

    self.mnist_train = list(
        zip(
            self.X_train,
            self.ohe.transform(self.y_train.reshape(-1, 1)).toarray(),
        )
    )
    self.mnist_test = list(
        zip(
            self.X_test,
            self.ohe.transform(self.y_test.reshape(-1, 1)).toarray(),
        )
    )

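# The _read_images/_read_labels helpers are referenced above but not shown.
# This is a minimal sketch, assuming the standard MNIST IDX layout (16-byte
# header for image files, 8-byte header for label files); the real methods may
# differ. They are written here as standalone functions for brevity, but would
# be methods (taking self) on the same data module class.
import numpy as np


def _read_images(fileobj, length):
    fileobj.read(16)  # skip magic number, image count, rows, cols
    buf = fileobj.read(28 * 28 * length)
    data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32)
    return data.reshape(length, 28, 28)


def _read_labels(fileobj, length):
    fileobj.read(8)  # skip magic number and label count
    buf = fileobj.read(length)
    return np.frombuffer(buf, dtype=np.uint8)
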
def test_makedir(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    # Verify makedir raises when the container already exists and
    # exist_ok is False
    with pytest.raises(FileExistsError):
        fs.makedir("data", exist_ok=False)

    # The container and directory already exist.  Should pass
    fs.makedir("data", exist_ok=True)
    assert "data/" in fs.ls(".")

    # Test creating a subdirectory when the container does not exist
    fs.makedir("new-container/dir")
    assert "new-container/dir" in fs.ls("new-container")
    fs.rm("new-container", recursive=True)

def test_mkdir_rmdir(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    fs.mkdir("new-container")
    assert "new-container/" in fs.ls("")
    assert fs.ls("new-container") == []

    with fs.open(path="new-container/file.txt", mode="wb") as f:
        f.write(b"0123456789")
    with fs.open("new-container/dir/file.txt", "wb") as f:
        f.write(b"0123456789")
    with fs.open("new-container/dir/file2.txt", "wb") as f:
        f.write(b"0123456789")

    # Check to verify you can skip making a directory if the container
    # already exists, but still create a file in that directory
    fs.mkdir("new-container/dir/file.txt", exist_ok=False)
    assert "new-container/" in fs.ls("")

    fs.mkdir("new-container/file2.txt", exist_ok=False)
    assert "new-container/file2.txt" in fs.ls("new-container")

    # Test to verify that the file contains expected contents
    with fs.open("new-container/file2.txt", "rb") as f:
        outfile = f.read()
    assert outfile == b""

    # Verify that mkdir on an existing nested file path passes
    fs.mkdir("new-container/dir/file2.txt", exist_ok=False)
    assert "new-container/dir/file2.txt" in fs.ls("new-container/dir")

    # Also verify you can make a nested directory structure
    fs.mkdir("new-container/dir2/file.txt", exist_ok=False)
    with fs.open("new-container/dir2/file.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/dir2/file.txt" in fs.ls("new-container/dir2")

    fs.rm("new-container/dir2", recursive=True)
    fs.rm("new-container/dir", recursive=True)
    assert fs.ls("new-container") == [
        "new-container/file.txt",
        "new-container/file2.txt",
    ]

    fs.rm("new-container/file.txt")
    fs.rm("new-container/file2.txt")
    fs.rmdir("new-container")

    assert "new-container/" not in fs.ls("")

def test_mkdir_rmdir(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    fs.mkdir("new-container")
    assert "new-container/" in fs.ls("")
    assert fs.ls("new-container") == []

    with fs.open(path="new-container/file.txt", mode="wb") as f:
        f.write(b"0123456789")
    with fs.open("new-container/dir/file.txt", "wb") as f:
        f.write(b"0123456789")
    with fs.open("new-container/dir/file2.txt", "wb") as f:
        f.write(b"0123456789")

    # Check to verify you can skip making a directory if the container
    # already exists, but still create a file in that directory
    fs.mkdir("new-container/dir/file.txt", exists_ok=True)
    assert "new-container/" in fs.ls("")

    fs.mkdir("new-container/file2.txt", exists_ok=True)
    with fs.open("new-container/file2.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/file2.txt" in fs.ls("new-container")

    fs.mkdir("new-container/dir/file2.txt", exists_ok=True)
    with fs.open("new-container/dir/file2.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/dir/file2.txt" in fs.ls("new-container/dir")

    # Also verify you can make a nested directory structure
    fs.mkdir("new-container/dir2/file.txt", exists_ok=True)
    with fs.open("new-container/dir2/file.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/dir2/file.txt" in fs.ls("new-container/dir2")

    fs.rm("new-container/dir2", recursive=True)
    fs.rm("new-container/dir", recursive=True)
    assert fs.ls("new-container") == [
        "new-container/file.txt",
        "new-container/file2.txt",
    ]

    fs.rm("new-container/file.txt")
    fs.rm("new-container/file2.txt")
    fs.rmdir("new-container")

    assert "new-container/" not in fs.ls("")

def test_cp_file(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    fs.mkdir("homedir")
    fs.mkdir("homedir/enddir")
    fs.touch("homedir/startdir/test_file.txt")
    fs.cp_file("homedir/startdir/test_file.txt", "homedir/enddir/test_file.txt")

    files = fs.ls("homedir/enddir")
    assert "homedir/enddir/test_file.txt" in files

    fs.rm("homedir", recursive=True)

def test_open_file(storage, mocker):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )

    f = fs.open("/data/root/a/file.txt")

    result = f.read()
    assert result == b"0123456789"

    close = mocker.patch.object(f.container_client, "close")
    f.close()
    print(fs.ls("/data/root/a"))

    close.assert_called_once()

def test_mkdir(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    # Verify mkdir will create a new container when create_parents is True
    fs.mkdir("new-container", create_parents=True)
    assert "new-container/" in fs.ls(".")
    fs.rm("new-container")

    # Verify a new container will not be created when create_parents
    # is False
    with pytest.raises(PermissionError):
        fs.mkdir("new-container", create_parents=False)

    # Test creating subdirectory when container does not exist
    fs.mkdir("new-container/dir", create_parents=True)
    assert "new-container/dir" in fs.ls("new-container")
    fs.rm("new-container", recursive=True)

    # Test raising error when container does not exist
    with pytest.raises(PermissionError):
        fs.mkdir("new-container/dir", create_parents=False)

def fs(self):
    from adlfs import AzureBlobFileSystem
    from azure.core.exceptions import AzureError

    try:
        file_system = AzureBlobFileSystem(**self.login_info)
        if self.bucket not in [
            container.rstrip("/") for container in file_system.ls("/")
        ]:
            file_system.mkdir(self.bucket)
    except (ValueError, AzureError) as e:
        raise AzureAuthError(
            f"Authentication to Azure Blob Storage via {self.login_method}"
            " failed.\nLearn more about configuration settings at"
            f" {format_link('https://man.dvc.org/remote/modify')}"
        ) from e

    return file_system

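# Illustrative only: one shape `self.login_info` could take for the property
# above. adlfs accepts an account name together with an account key, SAS token,
# connection string, or service-principal credentials; the keys and values
# below are placeholder assumptions, not the project's actual configuration.
login_info = {
    "account_name": "myaccount",
    "account_key": "<storage-account-key>",
    # alternatively: "sas_token", "connection_string", or
    # "tenant_id" / "client_id" / "client_secret" for a service principal
}
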
def test_makedir_rmdir(storage, caplog):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    fs.makedir("new-container")
    assert "new-container" in fs.ls("")
    assert fs.ls("new-container") == []

    with fs.open(path="new-container/file.txt", mode="wb") as f:
        f.write(b"0123456789")
    with fs.open("new-container/dir/file.txt", "wb") as f:
        f.write(b"0123456789")
    with fs.open("new-container/dir/file2.txt", "wb") as f:
        f.write(b"0123456789")

    # Verify that makedir will raise an exception if the directory exists
    # and exist_ok is False
    with pytest.raises(FileExistsError):
        fs.makedir("new-container/dir/file.txt", exist_ok=False)

    # makedir should raise an error if the container exists and
    # we try to create a nested directory, with exist_ok=False
    with pytest.raises(FileExistsError):
        fs.makedir("new-container/dir2", exist_ok=False)

    # Verify that makedir passes on an existing nested file
    # if exist_ok is True
    fs.makedir("new-container/dir/file2.txt", exist_ok=True)
    assert "new-container/dir/file2.txt" in fs.ls("new-container/dir")

    # Also verify you can make a nested directory structure
    with fs.open("new-container/dir2/file.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/dir2/file.txt" in fs.ls("new-container/dir2")

    fs.rm("new-container/dir2", recursive=True)
    fs.rm("new-container/dir", recursive=True)
    fs.touch("new-container/file2.txt")
    assert fs.ls("new-container") == [
        "new-container/file.txt",
        "new-container/file2.txt",
    ]

    fs.rm("new-container/file.txt")
    fs.rm("new-container/file2.txt")
    fs.rmdir("new-container")

    assert "new-container" not in fs.ls("")

def test_dask_parquet(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    fs.mkdir("test")
    STORAGE_OPTIONS = {
        "account_name": "devstoreaccount1",
        "connection_string": CONN_STR,
    }
    df = pd.DataFrame(
        {
            "col1": [1, 2, 3, 4],
            "col2": [2, 4, 6, 8],
            "index_key": [1, 1, 2, 2],
            "partition_key": [1, 1, 2, 2],
        }
    )

    dask_dataframe = dd.from_pandas(df, npartitions=1)
    for protocol in ["abfs", "az"]:
        dask_dataframe.to_parquet(
            "{}://[email protected]/test_group.parquet".format(protocol),
            storage_options=STORAGE_OPTIONS,
            engine="pyarrow",
        )

        fs = AzureBlobFileSystem(**STORAGE_OPTIONS)
        assert fs.ls("test/test_group.parquet") == [
            "test/test_group.parquet/_common_metadata",
            "test/test_group.parquet/_metadata",
            "test/test_group.parquet/part.0.parquet",
        ]
        fs.rm("test/test_group.parquet")

    df_test = dd.read_parquet(
        "abfs://test/test_group.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df, df_test)

def test_deep_paths(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )

    fs.mkdir("test_deep")
    assert "test_deep/" in fs.ls("")

    with fs.open("test_deep/a/b/c/file.txt", "wb") as f:
        f.write(b"0123456789")

    assert fs.ls("test_deep") == ["test_deep/a/"]
    assert fs.ls("test_deep/") == ["test_deep/a/"]
    assert fs.ls("test_deep/a") == ["test_deep/a/b/"]
    assert fs.ls("test_deep/a/") == ["test_deep/a/b/"]
    assert fs.find("test_deep") == ["test_deep/a/b/c/file.txt"]
    assert fs.find("test_deep/") == ["test_deep/a/b/c/file.txt"]
    assert fs.find("test_deep/a") == ["test_deep/a/b/c/file.txt"]
    assert fs.find("test_deep/a/") == ["test_deep/a/b/c/file.txt"]

    fs.rm("test_deep", recursive=True)

    assert "test_deep/" not in fs.ls("")
    assert fs.find("test_deep") == []

def test_ls(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    ## these are containers
    assert fs.ls("") == ["data"]
    assert fs.ls("/") == ["data"]
    assert fs.ls(".") == ["data"]
    assert fs.ls("*") == ["data"]

    ## these are top-level directories and files
    assert fs.ls("data") == ["data/root", "data/top_file.txt"]
    assert fs.ls("/data") == ["data/root", "data/top_file.txt"]

    # root contains files and directories
    assert fs.ls("data/root") == [
        "data/root/a",
        "data/root/a1",
        "data/root/b",
        "data/root/c",
        "data/root/d",
        "data/root/rfile.txt",
    ]
    assert fs.ls("data/root/") == [
        "data/root/a",
        "data/root/a1",
        "data/root/b",
        "data/root/c",
        "data/root/d",
        "data/root/rfile.txt",
    ]

    ## slashes are not needed, but accepted
    assert fs.ls("data/root/a") == ["data/root/a/file.txt"]
    assert fs.ls("data/root/a/") == ["data/root/a/file.txt"]
    assert fs.ls("/data/root/a") == ["data/root/a/file.txt"]
    assert fs.ls("/data/root/a/") == ["data/root/a/file.txt"]
    assert fs.ls("data/root/b") == ["data/root/b/file.txt"]
    assert fs.ls("data/root/b/") == ["data/root/b/file.txt"]
    assert fs.ls("data/root/a1") == ["data/root/a1/file1.txt"]
    assert fs.ls("data/root/a1/") == ["data/root/a1/file1.txt"]

    ## file details
    files = fs.ls("data/root/a/file.txt", detail=True)
    assert_blobs_equals(
        files,
        [
            {
                "name": "data/root/a/file.txt",
                "size": 10,
                "type": "file",
                "archive_status": None,
                "deleted": None,
                "creation_time": storage.insert_time,
                "last_modified": storage.insert_time,
                "deleted_time": None,
                "last_accessed_on": None,
                "remaining_retention_days": None,
                "tag_count": None,
                "tags": None,
                "metadata": {},
                "content_settings": {
                    "content_type": "application/octet-stream",
                    "content_encoding": None,
                    "content_language": None,
                    "content_md5": bytearray(
                        b"x\x1e^$]i\xb5f\x97\x9b\x86\xe2\x8d#\xf2\xc7"
                    ),
                    "content_disposition": None,
                    "cache_control": None,
                },
            }
        ],
    )

    # c has two files
    assert_blobs_equals(
        fs.ls("data/root/c", detail=True),
        [
            {
                "name": "data/root/c/file1.txt",
                "size": 10,
                "type": "file",
                "archive_status": None,
                "deleted": None,
                "creation_time": storage.insert_time,
                "last_modified": storage.insert_time,
                "deleted_time": None,
                "last_accessed_on": None,
                "remaining_retention_days": None,
                "tag_count": None,
                "tags": None,
                "metadata": {},
                "content_settings": {
                    "content_type": "application/octet-stream",
                    "content_encoding": None,
                    "content_language": None,
                    "content_md5": bytearray(
                        b"x\x1e^$]i\xb5f\x97\x9b\x86\xe2\x8d#\xf2\xc7"
                    ),
                    "content_disposition": None,
                    "cache_control": None,
                },
            },
            {
                "name": "data/root/c/file2.txt",
                "size": 10,
                "type": "file",
                "archive_status": None,
                "deleted": None,
                "creation_time": storage.insert_time,
                "last_modified": storage.insert_time,
                "deleted_time": None,
                "last_accessed_on": None,
                "remaining_retention_days": None,
                "tag_count": None,
                "tags": None,
                "metadata": {},
                "content_settings": {
                    "content_type": "application/octet-stream",
                    "content_encoding": None,
                    "content_language": None,
                    "content_md5": bytearray(
                        b"x\x1e^$]i\xb5f\x97\x9b\x86\xe2\x8d#\xf2\xc7"
                    ),
                    "content_disposition": None,
                    "cache_control": None,
                },
            },
        ],
    )

    # with metadata
    assert_blobs_equals(
        fs.ls("data/root/d", detail=True),
        [
            {
                "name": "data/root/d/file_with_metadata.txt",
                "size": 10,
                "type": "file",
                "archive_status": None,
                "deleted": None,
                "creation_time": storage.insert_time,
                "last_modified": storage.insert_time,
                "deleted_time": None,
                "last_accessed_on": None,
                "remaining_retention_days": None,
                "tag_count": None,
                "tags": None,
                "metadata": {"meta": "data"},
                "content_settings": {
                    "content_type": "application/octet-stream",
                    "content_encoding": None,
                    "content_language": None,
                    "content_md5": bytearray(
                        b"x\x1e^$]i\xb5f\x97\x9b\x86\xe2\x8d#\xf2\xc7"
                    ),
                    "content_disposition": None,
                    "cache_control": None,
                },
            }
        ],
    )

    ## if no direct match is found, an error is thrown
    with pytest.raises(FileNotFoundError):
        fs.ls("not-a-container")

    with pytest.raises(FileNotFoundError):
        fs.ls("data/not-a-directory/")

    with pytest.raises(FileNotFoundError):
        fs.ls("data/root/not-a-file.txt")

def test_log_large_dask_dataframe_to_azure(auth_method):
    # Create the environmental variables
    verify_auth_parameters_and_configure_env(auth_method)

    A = np.random.random_sample(size=(25000000, 6))
    df = pd.DataFrame(data=A, columns=list("ABCDEF"))
    ddf = dd.from_pandas(df, npartitions=10).persist()

    size = ddf.memory_usage().sum().compute()
    print(f"demo data has size: {size // 1e6} MB")
    # Verify that the size of the dataframe is > 1GB, and so
    # will write a collection of files, instead of a single
    # file
    assert (size // 1e6) > 1100

    # Create environmental vars
    context = mlrun.get_or_create_ctx("test")

    # Define the artifact location
    target_path = "az://" + config["env"].get("AZURE_CONTAINER") + "/"

    context.log_dataset(
        key="demo_data",
        df=ddf,
        format="parquet",
        artifact_path=target_path,
        stats=True,
    )

    data_item2 = mlrun.get_dataitem(f"{target_path}demo_data.parquet")
    ddf2 = data_item2.as_df(df_module=dd)

    # Check that a collection of files is written to Azure,
    # rather than a single parquet file
    from adlfs import AzureBlobFileSystem

    fs = AzureBlobFileSystem(
        account_name=os.getenv("AZURE_STORAGE_ACCOUNT_NAME"),
        account_key=os.getenv("AZURE_STORAGE_ACCOUNT_KEY"),
        connection_string=os.getenv("AZURE_STORAGE_CONNECTION_STRING"),
        tenant_id=os.getenv("AZURE_STORAGE_TENANT_ID"),
        client_id=os.getenv("AZURE_STORAGE_CLIENT_ID"),
        client_secret=os.getenv("AZURE_STORAGE_CLIENT_SECRET"),
        sas_token=os.getenv("AZURE_STORAGE_SAS_TOKEN"),
    )

    # Verify that a directory was created, rather than a file
    path = target_path.partition("//")[2]
    path = os.path.join(path, "demo_data.parquet")
    assert fs.isdir(path) is True

    # Verify that a collection of files was written
    files = fs.ls(path)
    assert len(files) > 4

    df2 = ddf2.compute()
    df2 = df2.reset_index(drop=True)
    df = ddf.compute()
    df = df.reset_index(drop=True)

    # Verify that the returned dataframe matches the original
    pd.testing.assert_frame_equal(
        df, df2, check_index_type=False, check_less_precise=True
    )

def test_dask_parquet(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )
    fs.mkdir("test")
    STORAGE_OPTIONS = {
        "account_name": "devstoreaccount1",
        "connection_string": CONN_STR,
    }
    df = pd.DataFrame(
        {
            "col1": [1, 2, 3, 4],
            "col2": [2, 4, 6, 8],
            "index_key": [1, 1, 2, 2],
            "partition_key": [1, 1, 2, 2],
        }
    )

    dask_dataframe = dd.from_pandas(df, npartitions=1)
    for protocol in ["abfs", "az"]:
        dask_dataframe.to_parquet(
            "{}://test/test_group.parquet".format(protocol),
            storage_options=STORAGE_OPTIONS,
            engine="pyarrow",
        )

        fs = AzureBlobFileSystem(**STORAGE_OPTIONS)
        assert fs.ls("test/test_group.parquet") == [
            "test/test_group.parquet/_common_metadata",
            "test/test_group.parquet/_metadata",
            "test/test_group.parquet/part.0.parquet",
        ]
        fs.rm("test/test_group.parquet")

    df_test = dd.read_parquet(
        "abfs://test/test_group.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df, df_test)

    A = np.random.randint(0, 100, size=(10000, 4))
    df2 = pd.DataFrame(data=A, columns=list("ABCD"))
    ddf2 = dd.from_pandas(df2, npartitions=4)
    dd.to_parquet(
        ddf2,
        "abfs://test/test_group2.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    )
    assert fs.ls("test/test_group2.parquet") == [
        "test/test_group2.parquet/_common_metadata",
        "test/test_group2.parquet/_metadata",
        "test/test_group2.parquet/part.0.parquet",
        "test/test_group2.parquet/part.1.parquet",
        "test/test_group2.parquet/part.2.parquet",
        "test/test_group2.parquet/part.3.parquet",
    ]
    df2_test = dd.read_parquet(
        "abfs://test/test_group2.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df2, df2_test)

    a = np.full(shape=(10000, 1), fill_value=1)
    b = np.full(shape=(10000, 1), fill_value=2)
    c = np.full(shape=(10000, 1), fill_value=3)
    d = np.full(shape=(10000, 1), fill_value=4)
    B = np.concatenate((a, b, c, d), axis=1)
    df3 = pd.DataFrame(data=B, columns=list("ABCD"))
    ddf3 = dd.from_pandas(df3, npartitions=4)
    dd.to_parquet(
        ddf3,
        "abfs://test/test_group3.parquet",
        partition_on=["A", "B"],
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    )
    assert fs.glob("test/test_group3.parquet/*") == [
        "test/test_group3.parquet/A=1",
        "test/test_group3.parquet/_common_metadata",
        "test/test_group3.parquet/_metadata",
    ]
    df3_test = dd.read_parquet(
        "abfs://test/test_group3.parquet",
        filters=[("A", "=", 1)],
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    df3_test = df3_test[["A", "B", "C", "D"]]
    df3_test = df3_test[["A", "B", "C", "D"]].astype(int)
    assert_frame_equal(df3, df3_test)

    A = np.random.randint(0, 100, size=(10000, 4))
    df4 = pd.DataFrame(data=A, columns=list("ABCD"))
    ddf4 = dd.from_pandas(df4, npartitions=4)
    dd.to_parquet(
        ddf4,
        "abfs://test/test_group4.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
        flavor="spark",
        write_statistics=False,
    )
    fs.rmdir("test/test_group4.parquet/_common_metadata", recursive=True)
    fs.rmdir("test/test_group4.parquet/_metadata", recursive=True)
    fs.rm("test/test_group4.parquet/_common_metadata")
    fs.rm("test/test_group4.parquet/_metadata")
    assert fs.ls("test/test_group4.parquet") == [
        "test/test_group4.parquet/part.0.parquet",
        "test/test_group4.parquet/part.1.parquet",
        "test/test_group4.parquet/part.2.parquet",
        "test/test_group4.parquet/part.3.parquet",
    ]
    df4_test = dd.read_parquet(
        "abfs://test/test_group4.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df4, df4_test)

    A = np.random.randint(0, 100, size=(10000, 4))
    df5 = pd.DataFrame(data=A, columns=list("ABCD"))
    ddf5 = dd.from_pandas(df5, npartitions=4)
    dd.to_parquet(
        ddf5,
        "abfs://test/test group5.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    )
    assert fs.ls("test/test group5.parquet") == [
        "test/test group5.parquet/_common_metadata",
        "test/test group5.parquet/_metadata",
        "test/test group5.parquet/part.0.parquet",
        "test/test group5.parquet/part.1.parquet",
        "test/test group5.parquet/part.2.parquet",
        "test/test group5.parquet/part.3.parquet",
    ]
    df5_test = dd.read_parquet(
        "abfs://test/test group5.parquet",
        storage_options=STORAGE_OPTIONS,
        engine="pyarrow",
    ).compute()
    assert_frame_equal(df5, df5_test)

def test_ls(storage):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name, connection_string=CONN_STR
    )

    ## these are containers
    assert fs.ls("") == ["data/"]
    assert fs.ls("/") == ["data/"]
    assert fs.ls(".") == ["data/"]

    ## these are top-level directories and files
    assert fs.ls("data") == ["data/root/", "data/top_file.txt"]
    assert fs.ls("/data") == ["data/root/", "data/top_file.txt"]

    # root contains files and directories
    assert fs.ls("data/root") == [
        "data/root/a/",
        "data/root/b/",
        "data/root/c/",
        "data/root/rfile.txt",
    ]
    assert fs.ls("data/root/") == [
        "data/root/a/",
        "data/root/b/",
        "data/root/c/",
        "data/root/rfile.txt",
    ]

    ## slashes are not needed, but accepted
    assert fs.ls("data/root/a") == ["data/root/a/file.txt"]
    assert fs.ls("data/root/a/") == ["data/root/a/file.txt"]
    assert fs.ls("/data/root/a") == ["data/root/a/file.txt"]
    assert fs.ls("/data/root/a/") == ["data/root/a/file.txt"]

    ## file details
    assert fs.ls("data/root/a/file.txt", detail=True) == [
        {"name": "data/root/a/file.txt", "size": 10, "type": "file"}
    ]

    # c has two files
    assert fs.ls("data/root/c", detail=True) == [
        {"name": "data/root/c/file1.txt", "size": 10, "type": "file"},
        {"name": "data/root/c/file2.txt", "size": 10, "type": "file"},
    ]

    ## if no direct match is found, an error is thrown
    with pytest.raises(FileNotFoundError):
        fs.ls("not-a-container")

    with pytest.raises(FileNotFoundError):
        fs.ls("data/not-a-directory/")

    with pytest.raises(FileNotFoundError):
        fs.ls("data/root/not-a-file.txt")

class AzureDataLake(Source):
    """
    A class for pulling data from the Azure Data Lakes (gen1 and gen2).

    You can either connect to the lake in general or to a particular path, e.g.
        lake = AzureDataLake(); lake.exists("a/b/c.csv")
    vs
        lake = AzureDataLake(path="a/b/c.csv"); lake.exists()

    Parameters
    ----------
    credentials : Dict[str, Any], optional
        A dictionary containing ACCOUNT_NAME and the following
        Service Principal credentials:
            - AZURE_TENANT_ID
            - AZURE_CLIENT_ID
            - AZURE_CLIENT_SECRET
    """

    def __init__(
        self,
        path: str = None,
        gen: int = 2,
        credentials: Dict[str, Any] = None,
        *args,
        **kwargs,
    ):
        credentials = credentials or local_config.get("AZURE_ADLS")
        super().__init__(*args, credentials=credentials, **kwargs)

        storage_account_name = self.credentials["ACCOUNT_NAME"]
        tenant_id = self.credentials["AZURE_TENANT_ID"]
        client_id = self.credentials["AZURE_CLIENT_ID"]
        client_secret = self.credentials["AZURE_CLIENT_SECRET"]

        self.path = path
        self.gen = gen
        self.storage_options = {
            "tenant_id": tenant_id,
            "client_id": client_id,
            "client_secret": client_secret,
        }
        if gen == 1:
            self.fs = AzureDatalakeFileSystem(
                store_name=storage_account_name,
                tenant_id=tenant_id,
                client_id=client_id,
                client_secret=client_secret,
            )
            self.base_url = f"adl://{storage_account_name}"
        elif gen == 2:
            self.storage_options["account_name"] = storage_account_name
            self.fs = AzureBlobFileSystem(
                account_name=storage_account_name,
                tenant_id=tenant_id,
                client_id=client_id,
                client_secret=client_secret,
            )
            self.base_url = "az://"

    def upload(
        self,
        from_path: str,
        to_path: str = None,
        recursive: bool = False,
        overwrite: bool = False,
    ) -> None:
        """
        Upload file(s) to the lake.

        Args:
            from_path (str): Path to the local file(s) to be uploaded.
            to_path (str): Path to the destination file/folder.
            recursive (bool): Set this to True if working with directories.
            overwrite (bool): Whether to overwrite the file(s) if they exist.

        Example:
        ```python
        from viadot.sources import AzureDataLake

        lake = AzureDataLake()
        lake.upload(from_path='tests/test.csv', to_path="sandbox/test.csv")
        ```
        """
        if self.gen == 1:
            raise NotImplementedError(
                "Azure Data Lake Gen1 does not support simple file upload."
            )

        to_path = to_path or self.path
        self.fs.upload(
            lpath=from_path,
            rpath=to_path,
            recursive=recursive,
            overwrite=overwrite,
        )

    def exists(self, path: str = None) -> bool:
        """
        Check if a location exists in Azure Data Lake.

        Args:
            path (str): The path to check. Can be a file or a directory.

        Example:
        ```python
        from viadot.sources import AzureDataLake

        lake = AzureDataLake(gen=1)
        lake.exists("tests/test.csv")
        ```

        Returns:
            bool: Whether the path exists.
        """
        path = path or self.path
        return self.fs.exists(path)

    def download(
        self,
        to_path: str,
        from_path: str = None,
        recursive: bool = False,
        overwrite: bool = True,
    ) -> None:
        if overwrite is False:
            raise NotImplementedError(
                "Currently, only the default behavior (overwrite) is available."
            )

        from_path = from_path or self.path
        self.fs.download(rpath=from_path, lpath=to_path, recursive=recursive)

    def to_df(
        self,
        path: str = None,
        sep: str = "\t",
        quoting: int = 0,
        lineterminator: str = None,
        error_bad_lines: bool = None,
    ):
        if quoting is None:
            quoting = 0

        path = path or self.path
        url = os.path.join(self.base_url, path)

        if url.endswith(".csv"):
            df = pd.read_csv(
                url,
                storage_options=self.storage_options,
                sep=sep,
                quoting=quoting,
                lineterminator=lineterminator,
                error_bad_lines=error_bad_lines,
            )
        elif url.endswith(".parquet"):
            df = pd.read_parquet(url, storage_options=self.storage_options)
        else:
            raise ValueError("Only CSV and parquet formats are supported.")

        return df

    def ls(self, path: str = None) -> List[str]:
        path = path or self.path
        return self.fs.ls(path)

    def rm(self, path: str = None, recursive: bool = False):
        path = path or self.path
        self.fs.rm(path, recursive=recursive)

    def cp(self, from_path: str = None, to_path: str = None, recursive: bool = False):
        from_path = from_path or self.path
        self.fs.cp(from_path, to_path, recursive=recursive)

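# A short usage sketch for the class above: checking a path and reading a CSV
# from the lake into a DataFrame. It assumes credentials are available via
# local_config["AZURE_ADLS"] (as in __init__); the path below is a placeholder.
lake = AzureDataLake(gen=2)
if lake.exists("sandbox/test.csv"):
    df = lake.to_df(path="sandbox/test.csv", sep=",")
    print(df.head())
    print(lake.ls("sandbox"))
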
def test_makedir_rmdir(storage, caplog):
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )

    fs.makedir("new-container")
    assert "new-container/" in fs.ls("")
    assert fs.ls("new-container") == []

    with fs.open(path="new-container/file.txt", mode="wb") as f:
        f.write(b"0123456789")
    with fs.open("new-container/dir/file.txt", "wb") as f:
        f.write(b"0123456789")
    with fs.open("new-container/dir/file2.txt", "wb") as f:
        f.write(b"0123456789")

    # Verify that makedir will raise an exception if the directory exists
    # and exist_ok is False
    with pytest.raises(FileExistsError):
        fs.makedir("new-container/dir/file.txt", exist_ok=False)

    # Verify that makedir creates a directory if exist_ok is False and the
    # directory does not exist
    fs.makedir("new-container/file2.txt", exist_ok=False)
    assert "new-container/file2.txt" in fs.ls("new-container")

    # Verify that makedir will silently ignore an existing directory if
    # the directory exists and exist_ok is True
    fs.makedir("new-container/dir", exist_ok=True)
    assert "new-container/dir/" in fs.ls("new-container")

    # Test to verify that the file contains expected contents
    with fs.open("new-container/file2.txt", "rb") as f:
        outfile = f.read()
    assert outfile == b""

    # Verify that makedir passes on an existing nested file
    # if exist_ok is True
    fs.makedir("new-container/dir/file2.txt", exist_ok=True)
    assert "new-container/dir/file2.txt" in fs.ls("new-container/dir")

    # Also verify you can make a nested directory structure
    fs.makedir("new-container/dir2/file.txt", exist_ok=False)
    with fs.open("new-container/dir2/file.txt", "wb") as f:
        f.write(b"0123456789")
    assert "new-container/dir2/file.txt" in fs.ls("new-container/dir2")

    fs.rm("new-container/dir2", recursive=True)
    fs.rm("new-container/dir", recursive=True)
    assert fs.ls("new-container") == [
        "new-container/file.txt",
        "new-container/file2.txt",
    ]

    fs.rm("new-container/file.txt")
    fs.rm("new-container/file2.txt")
    fs.rmdir("new-container")

    assert "new-container/" not in fs.ls("")

import dask.dataframe as dd
from distributed import Client
from adlfs import AzureBlobFileSystem

# setup variables
container_name = "malware"
storage_options = {"account_name": "azuremlexamples"}

# create distributed client
c = Client()

# create Azure filesystem
fs = AzureBlobFileSystem(**storage_options)

# list files
files = fs.ls(f"{container_name}/processed")

# read in training data
for f in files:
    if "train" in f:
        df = dd.read_parquet(f"az://{f}", storage_options=storage_options)

# advanced feature engineering
cols = [col for col in df.columns if df.dtypes[col] != "object"]

# define system input and output
X = df[cols].drop("HasDetections", axis=1).values.persist()
y = df["HasDetections"].values.persist()

# print something
print(len(X))