Example #1
from adlfs import AzureBlobFileSystem


def test_isdir(storage):
    fs = AzureBlobFileSystem(account_name=storage.account_name,
                             connection_string=CONN_STR)
    BUCKET = "/name/of/the/bucket"
    BASE_PATH = BUCKET + "/" + "012345"
    # EMPTY_DIR = BASE_PATH + "/empty_dir"

    fs.makedirs(BASE_PATH)
    assert fs.isdir(BASE_PATH) is True
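
The test above assumes a storage fixture and a CONN_STR constant supplied by the surrounding test module. A minimal sketch of what they might look like against a local Azurite emulator (the account name and key are Azurite's published development defaults, not real credentials; the fixture shape is an assumption):

import pytest
from types import SimpleNamespace

# Azurite's well-known development-storage credentials
ACCOUNT_NAME = "devstoreaccount1"
ACCOUNT_KEY = (
    "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq"
    "/K1SZFPTOtr/KBHBeksoGMGw=="
)
CONN_STR = (
    f"DefaultEndpointsProtocol=http;AccountName={ACCOUNT_NAME};"
    f"AccountKey={ACCOUNT_KEY};"
    f"BlobEndpoint=http://127.0.0.1:10000/{ACCOUNT_NAME};"
)

@pytest.fixture
def storage():
    # Hypothetical stand-in for the real fixture; only account_name
    # is used by the test above
    return SimpleNamespace(account_name=ACCOUNT_NAME)
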
import os

import dask.dataframe as dd
import numpy as np
import pandas as pd

import mlrun


def test_log_large_dask_dataframe_to_azure(auth_method):
    # Set the Azure credentials in the environment for the chosen auth method
    verify_auth_parameters_and_configure_env(auth_method)

    # ~25M rows x 6 float64 columns is roughly 1.2 GB of data
    A = np.random.random_sample(size=(25000000, 6))
    df = pd.DataFrame(data=A, columns=list("ABCDEF"))
    # persist() materializes the partitions so later steps don't
    # regenerate the random data
    ddf = dd.from_pandas(df, npartitions=10).persist()

    size = ddf.memory_usage().sum().compute()
    print(f"demo data has size: {size // 1e6} MB")
    # Verify that the dataframe is larger than 1 GB, so the parquet
    # writer will produce a directory of part-files instead of a
    # single file
    assert (size // 1e6) > 1100

    # Get or create an MLRun run context for logging the artifact
    context = mlrun.get_or_create_ctx("test")

    # Define the artifact location; config holds the test suite's
    # settings, and AZURE_CONTAINER names the target blob container
    target_path = "az://" + config["env"].get("AZURE_CONTAINER") + "/"

    context.log_dataset(
        key="demo_data",
        df=ddf,
        format="parquet",
        artifact_path=target_path,
        stats=True,
    )

    data_item2 = mlrun.get_dataitem(f"{target_path}demo_data.parquet")
    ddf2 = data_item2.as_df(df_module=dd)

    # Check that a collection of files is written to Azure,
    # rather than a single parquet file
    from adlfs import AzureBlobFileSystem

    fs = AzureBlobFileSystem(
        account_name=os.getenv("AZURE_STORAGE_ACCOUNT_NAME"),
        account_key=os.getenv("AZURE_STORAGE_ACCOUNT_KEY"),
        connection_string=os.getenv("AZURE_STORAGE_CONNECTION_STRING"),
        tenant_id=os.getenv("AZURE_STORAGE_TENANT_ID"),
        client_id=os.getenv("AZURE_STORAGE_CLIENT_ID"),
        client_secret=os.getenv("AZURE_STORAGE_CLIENT_SECRET"),
        sas_token=os.getenv("AZURE_STORAGE_SAS_TOKEN"),
    )
    # Verify that a directory was created, rather than a file
    path = target_path.partition("//")[2]  # strip the "az://" scheme
    path = os.path.join(path, "demo_data.parquet")
    assert fs.isdir(path) is True

    # Verify that a collection of files was written
    files = fs.ls(path)
    assert len(files) > 4

    df2 = ddf2.compute()
    df2 = df2.reset_index(drop=True)
    df = ddf.compute()
    df = df.reset_index(drop=True)
    # Verify that the returned dataframe matches the original
    # (rtol/atol replaces check_less_precise, which was deprecated
    # in pandas 1.1 and removed in 2.0)
    pd.testing.assert_frame_equal(df,
                                  df2,
                                  check_index_type=False,
                                  rtol=1e-3,
                                  atol=1e-3)
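
For reference, the partitioned parquet directory written above can also be read back with dask alone, bypassing mlrun's DataItem layer. A short sketch, assuming account-key auth and an AZURE_CONTAINER environment variable that mirrors the test config's container name:

import os

import dask.dataframe as dd

# adlfs registers the az:// protocol with fsspec, so dask can read
# the directory of part-files directly; credentials are passed via
# storage_options
ddf_back = dd.read_parquet(
    "az://" + os.environ["AZURE_CONTAINER"] + "/demo_data.parquet",
    storage_options={
        "account_name": os.getenv("AZURE_STORAGE_ACCOUNT_NAME"),
        "account_key": os.getenv("AZURE_STORAGE_ACCOUNT_KEY"),
    },
)
print(ddf_back.npartitions)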