Example #1
def test_download_blobxfer(test_output_dirs: TestOutputDirectories, is_file: bool, runner_config: AzureConfig) -> None:
    """
    Test for a bug in early versions of download_blobs: download is happening via prefixes, but because of
    stripping leading directory names, blobs got overwritten.
    """
    root = Path(test_output_dirs.root_dir)
    account_key = runner_config.get_dataset_storage_account_key()
    assert account_key is not None
    # Expected test data in Azure blobs:
    # folder1/folder1.txt with content "folder1.txt"
    # folder1/otherfile.txt with content "folder1.txt" (checked by the assertions below)
    # folder1_with_suffix/folder2.txt with content "folder2.txt"
    # folder1_with_suffix/folder1.txt with content "this comes from folder2"
    # With the bug present, folder1_with_suffix/folder1.txt would overwrite folder1/folder1.txt.
    blobs_root_path = "data-for-testsuite/folder1"
    if is_file:
        blobs_root_path += "/folder1.txt"
    download_blobs(runner_config.datasets_storage_account, account_key, blobs_root_path, root, is_file)

    folder1 = root / "folder1.txt"
    assert folder1.exists()
    if not is_file:
        otherfile = root / "otherfile.txt"
        folder2 = root / "folder2.txt"
        assert folder1.read_text().strip() == "folder1.txt"
        assert otherfile.exists()
        assert otherfile.read_text().strip() == "folder1.txt"
        assert not folder2.exists()
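
The bug this test guards against is easy to reproduce in isolation: a downloader that matches blobs by prefix and then keeps only the final path component maps blobs from different prefixes onto the same local file. A minimal, self-contained sketch (blob names mirror the comments in the test):

def buggy_local_name(blob_name: str) -> str:
    # Keeps only the last path component of the blob name -- this is the bug.
    return blob_name.split("/")[-1]

# Prefix "folder1" also matches "folder1_with_suffix", and after stripping the
# directory parts both blobs land on the same local file name.
blobs = ["folder1/folder1.txt", "folder1_with_suffix/folder1.txt"]
assert {buggy_local_name(b) for b in blobs} == {"folder1.txt"}  # collision: the second download overwrites the first
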
def upload_to_dataset_directory(azure_config: AzureConfig, dataset_dir: str, files: Set[str]) -> None:
    if not files:
        return
    account_key = azure_config.get_dataset_storage_account_key()
    block_blob_service = BlockBlobService(account_name=azure_config.datasets_storage_account, account_key=account_key)
    # Join with an explicit "/" rather than os.path.join, which would insert "\" on Windows
    # and break the resulting blob path.
    container_name = azure_config.datasets_container + "/" + os.path.basename(dataset_dir)
    for path in files:
        blob_name = path[len(dataset_dir) + 1:]
        block_blob_service.create_blob_from_path(container_name, blob_name, path)
        print(f"Uploaded {path} to {azure_config.datasets_storage_account}:{container_name}/{blob_name}")
def download_dataset_directory(azure_config: AzureConfig, dataset_dir: str) -> bool:
    if os.path.isdir(dataset_dir):
        return False
    account_key = azure_config.get_dataset_storage_account_key()
    blobs_root_path = os.path.join(azure_config.datasets_container, os.path.basename(dataset_dir)) + "/"
    sys.stdout.write(f"Downloading data to {dataset_dir} ...")
    assert account_key is not None  # for mypy
    download_blobs(azure_config.datasets_storage_account, account_key, blobs_root_path, Path(dataset_dir))
    sys.stdout.write("done\n")
    return True
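
A hypothetical usage sketch for `download_dataset_directory`: the function returns False without downloading when the target directory already exists, so calling it repeatedly is safe. The path below is illustrative.

azure_config = AzureConfig()  # assumes account name and key are already configured
if download_dataset_directory(azure_config, "/data/my_dataset"):
    print("Dataset was downloaded.")
else:
    print("Dataset directory already exists; download skipped.")
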
Example #4
def download_dataset_via_blobxfer(dataset_id: str, azure_config: AzureConfig,
                                  target_folder: Path) -> Optional[Path]:
    """
    Attempts to downloads a dataset from the Azure storage account for datasets, with download happening via
    blobxfer. This is only possible if the datasets storage account and keyword are present in the `azure_config`.
    The function returns None if the required settings were not present.
    :param dataset_id: The folder of the dataset, expected in the container given by azure_config.datasets_container.
    :param azure_config: The object with all Azure-related settings.
    :param target_folder: The local folder into which the dataset should be downloaded.
    :return: The folder that contains the downloaded dataset. Returns None if the datasets account name or password
    were not present.
    """
    datasets_account_key = azure_config.get_dataset_storage_account_key()
    if not datasets_account_key:
        logging.info(
            "No account key for the dataset storage account was found.")
        logging.info(
            f"We checked in environment variables and in the file {PROJECT_SECRETS_FILE}"
        )
        return None
    if (not azure_config.datasets_container) or (
            not azure_config.datasets_storage_account):
        logging.info("Datasets storage account or container missing.")
        return None
    target_folder.mkdir(exist_ok=True)
    result_folder = target_folder / dataset_id
    # Only download if the dataset hasn't already been downloaded.
    if result_folder.is_dir():
        logging.info(
            f"Folder already exists, skipping download: {result_folder}")
        return result_folder
    with logging_section(f"Downloading dataset {dataset_id}"):
        download_blobs(
            account=azure_config.datasets_storage_account,
            account_key=datasets_account_key,
            # When specifying the blobs root path, ensure that there is a slash at the end, otherwise
            # all datasets with that dataset_id as a prefix get downloaded.
            blobs_root_path=f"{azure_config.datasets_container}/{dataset_id}/",
            destination=result_folder)
    return result_folder
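
A hypothetical usage sketch for `download_dataset_via_blobxfer`, assuming an `AzureConfig` with the dataset storage settings filled in; the dataset id and cache folder are made up for illustration.

from pathlib import Path

azure_config = AzureConfig()
dataset_path = download_dataset_via_blobxfer(dataset_id="my_dataset",
                                             azure_config=azure_config,
                                             target_folder=Path("dataset_cache"))
if dataset_path is None:
    print("Dataset storage account or key not configured; nothing downloaded.")
else:
    print(f"Dataset available at {dataset_path}")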