コード例 #1
0
def download_dataset(azure_dataset_id: str,
                     target_folder: Path,
                     dataset_csv: str,
                     azure_config: AzureConfig) -> Path:
    """
    Makes the AzureML dataset with the given ID available on the executing machine and returns its folder.
    The dataset is retrieved via `get_or_create_dataset` and must be a FileDataset. Its local location is
    the subfolder of `target_folder` that carries the same name as the dataset. If that subfolder already
    exists and contains the file named by `dataset_csv`, the dataset is assumed to be present from a previous
    run and no download is started. Otherwise the dataset is downloaded into that subfolder (without
    overwriting existing files), and the elapsed download time is logged.
    :param azure_dataset_id: The name of a dataset that is registered in the AzureML workspace.
    :param target_folder: The folder in which to download the dataset from Azure.
    :param dataset_csv: Name of the csv file describing the dataset.
    :param azure_config: All Azure-related configuration options.
    :return: A path on the local machine that contains the dataset.
    :raises ValueError: If the object retrieved for `azure_dataset_id` is not a FileDataset.
    """
    logging.info("Trying to download dataset via AzureML datastore now.")
    dataset = get_or_create_dataset(azure_config, azure_dataset_id)
    if not isinstance(dataset, FileDataset):
        raise ValueError(f"Expected to get a FileDataset, but got {type(dataset)}")

    # Location where the dataset lives locally, and the marker file that indicates a completed download.
    local_folder = target_folder / azure_dataset_id
    marker_file = local_folder / dataset_csv
    logging.info(f"Model training will use dataset '{azure_dataset_id}' in Azure.")

    if local_folder.is_dir() and marker_file.is_file():
        # A previous run has left the dataset behind: re-use it.
        logging.info(f"The dataset appears to be downloaded already in {local_folder}. Skipping.")
    else:
        logging.info("Starting to download the dataset - WARNING, this could take very long!")
        with logging_section("Downloading dataset"):
            started_at = time.perf_counter()
            dataset.download(target_path=str(local_folder), overwrite=False)
            elapsed_seconds = time.perf_counter() - started_at
            logging.info(f"Azure dataset '{azure_dataset_id}' downloaded in {elapsed_seconds} seconds")
        logging.info(f"Azure dataset '{azure_dataset_id}' is now available in {local_folder}")
    return local_folder
コード例 #2
0
def download_dataset(azure_dataset_id: str, target_folder: Path,
                     azure_config: AzureConfig) -> Path:
    """
    Makes the AzureML dataset with the given ID available on the executing machine and returns its folder.
    A download via blobxfer is attempted first: if `download_dataset_via_blobxfer` returns a truthy result,
    that result is returned directly. If it returns a falsy result or raises, the exception (if any) is
    reported and the function falls back to the AzureML dataset attached to the workspace. In the fallback,
    the dataset must be a FileDataset, and it is placed in the subfolder of `target_folder` that has the same
    name as the dataset. If that subfolder already exists and contains the dataset csv file
    (`DATASET_CSV_FILE_NAME`), no download is started.
    :param azure_dataset_id: The name of a dataset that is registered in the AzureML workspace.
    :param target_folder: The folder in which to download the dataset from Azure.
    :param azure_config: All Azure-related configuration options.
    :return: A path on the local machine that contains the dataset.
    :raises ValueError: If the object retrieved for `azure_dataset_id` is not a FileDataset.
    """
    # The workspace is resolved up front, before the blobxfer attempt.
    workspace = azure_config.get_workspace()
    try:
        blobxfer_result = download_dataset_via_blobxfer(dataset_id=azure_dataset_id,
                                                        azure_config=azure_config,
                                                        target_folder=target_folder)
        if blobxfer_result:
            return blobxfer_result
    except Exception as error:
        # Best effort only: any blobxfer failure is reported, then the AzureML download below is used.
        print_exception(error, message="Unable to download dataset via blobxfer.")

    logging.info("Trying to download dataset via AzureML datastore now.")
    dataset = get_or_create_dataset(workspace, azure_dataset_id)
    if not isinstance(dataset, FileDataset):
        raise ValueError(f"Expected to get a FileDataset, but got {type(dataset)}")

    # Location where the dataset lives locally, and the marker file that indicates a completed download.
    local_folder = target_folder / azure_dataset_id
    marker_file = local_folder / DATASET_CSV_FILE_NAME
    logging.info(f"Model training will use dataset '{azure_dataset_id}' in Azure.")

    if local_folder.is_dir() and marker_file.is_file():
        # A previous run has left the dataset behind: re-use it.
        logging.info(f"The dataset appears to be downloaded already in {local_folder}. Skipping.")
    else:
        logging.info("Starting to download the dataset - WARNING, this could take very long!")
        with logging_section("Downloading dataset"):
            dataset.download(target_path=str(local_folder), overwrite=False)
        logging.info(f"Azure dataset '{azure_dataset_id}' is now available in {local_folder}")
    return local_folder