def test_isdir(storage):
    """Verify that ``isdir`` reports True for a freshly created directory.

    Args:
        storage: pytest fixture supplying the Azure storage account under
            test (provides ``account_name``); connection string comes from
            the module-level ``CONN_STR``.
    """
    fs = AzureBlobFileSystem(
        account_name=storage.account_name,
        connection_string=CONN_STR,
    )
    bucket = "/name/of/the/bucket"
    base_path = bucket + "/" + "012345"
    fs.makedirs(base_path)
    # Truthiness assert: ``isdir`` need only return a truthy value, not the
    # ``True`` singleton — ``assert x is True`` would fail on any truthy
    # stand-in the filesystem layer might return.
    assert fs.isdir(base_path)
def test_log_large_dask_dataframe_to_azure(auth_method):
    """Log a >1 GB dask dataframe to Azure and read it back intact.

    Verifies that ``log_dataset`` writes a *directory* of parquet part-files
    (rather than one monolithic file) for a large dask dataframe, and that
    the round-tripped data equals the original.

    Args:
        auth_method: parametrized fixture naming the Azure auth scheme;
            ``verify_auth_parameters_and_configure_env`` translates it into
            the environment variables read below.
    """
    # Configure the environment variables for the requested auth method
    verify_auth_parameters_and_configure_env(auth_method)

    # Build a dataframe large enough (>1 GB) that mlrun/dask will shard the
    # parquet output into multiple part-files.
    A = np.random.random_sample(size=(25000000, 6))
    df = pd.DataFrame(data=A, columns=list("ABCDEF"))
    ddf = dd.from_pandas(df, npartitions=10).persist()

    size = ddf.memory_usage().sum().compute()
    print(f"demo data has size: {size // 1e6} MB")
    # Guard the test's premise: the data must exceed ~1.1 GB so a collection
    # of files (not a single file) gets written.
    assert (size // 1e6) > 1100

    context = mlrun.get_or_create_ctx("test")

    # Define the artifact location
    target_path = "az://" + config["env"].get("AZURE_CONTAINER") + "/"
    context.log_dataset(
        key="demo_data",
        df=ddf,
        format="parquet",
        artifact_path=target_path,
        stats=True,
    )

    data_item2 = mlrun.get_dataitem(f"{target_path}demo_data.parquet")
    ddf2 = data_item2.as_df(df_module=dd)

    # Check that a collection of files is written to Azure, rather than a
    # single parquet file
    from adlfs import AzureBlobFileSystem

    fs = AzureBlobFileSystem(
        account_name=os.getenv("AZURE_STORAGE_ACCOUNT_NAME"),
        account_key=os.getenv("AZURE_STORAGE_ACCOUNT_KEY"),
        connection_string=os.getenv("AZURE_STORAGE_CONNECTION_STRING"),
        tenant_id=os.getenv("AZURE_STORAGE_TENANT_ID"),
        client_id=os.getenv("AZURE_STORAGE_CLIENT_ID"),
        client_secret=os.getenv("AZURE_STORAGE_CLIENT_SECRET"),
        sas_token=os.getenv("AZURE_STORAGE_SAS_TOKEN"),
    )

    # Verify that a directory was created, rather than a file.
    # Blob object paths are always '/'-separated, so build the path with
    # plain string concatenation — os.path.join would insert a backslash on
    # Windows.  target_path ends with '/', so partition keeps the trailing
    # separator.
    path = target_path.partition("//")[2] + "demo_data.parquet"
    assert fs.isdir(path)

    # Verify that a collection of files was written
    files = fs.ls(path)
    assert len(files) > 4

    df2 = ddf2.compute().reset_index(drop=True)
    df = ddf.compute().reset_index(drop=True)

    # Verify that the returned dataframe matches the original.
    # check_less_precise was deprecated in pandas 1.1 and removed in 2.0;
    # rtol=1e-3 matches its old "3 significant digits" behavior.
    pd.testing.assert_frame_equal(
        df, df2, check_index_type=False, rtol=1e-3
    )