Example 1
def get_default_azure_config() -> AzureConfig:
    """
    Gets the Azure-related configuration options, using the default settings file settings.yaml.
    """
    return AzureConfig.from_yaml(
        yaml_file_path=fixed_paths.SETTINGS_YAML_FILE,
        project_root=fixed_paths.repository_root_directory())
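
A hedged usage sketch for this helper; the test name and assertion below are illustrative and not part of the repository:

def test_default_azure_config_resolves_workspace() -> None:
    # Illustrative check only: the default settings file should resolve to a reachable AzureML workspace.
    azure_config = get_default_azure_config()
    workspace = azure_config.get_workspace()
    assert workspace.name, "Expected get_workspace() to return a workspace with a name"
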
def test_submit_for_inference(test_output_dirs: OutputFolderForTests) -> None:
    """
    Execute the submit_for_inference script on the model that was recently trained. This starts an AzureML job,
    and downloads the segmentation. Then check if the segmentation was actually produced.
    :return:
    """
    model = get_most_recent_model()
    image_file = (fixed_paths_for_tests.full_ml_test_data_path() /
                  "train_and_test_data" / "id1_channel1.nii.gz")
    assert image_file.exists(), f"Image file not found: {image_file}"
    settings_file = fixed_paths.SETTINGS_YAML_FILE
    assert settings_file.exists(), f"Settings file not found: {settings_file}"
    azure_config = AzureConfig.from_yaml(
        settings_file, project_root=fixed_paths.repository_root_directory())
    # Read the name of the branch from environment, so that the inference experiment is also listed alongside
    # all other AzureML runs that belong to the current PR.
    build_branch = os.environ.get("BUILD_BRANCH", None)
    experiment_name = to_azure_friendly_string(build_branch) if build_branch else "model_inference"
    azure_config.get_git_information()
    args = [
        "--image_file",
        str(image_file), "--model_id", model.id, "--settings",
        str(settings_file), "--download_folder",
        str(test_output_dirs.root_dir), "--cluster", "training-nc12",
        "--experiment", experiment_name
    ]
    seg_path = test_output_dirs.root_dir / DEFAULT_RESULT_IMAGE_NAME
    assert not seg_path.exists(), f"Result file {seg_path} should not yet exist"
    submit_for_inference.main(
        args, project_root=fixed_paths.repository_root_directory())
    assert seg_path.exists(), f"Result file {seg_path} was not created"
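
The experiment name above is first passed through to_azure_friendly_string. A minimal sketch of what such a sanitizer could look like, assuming AzureML experiment names only accept letters, digits, underscores and dashes (an illustrative reconstruction, not the repository's implementation):

import re


def to_azure_friendly_string_sketch(value: str) -> str:
    # Replace every character assumed not to be allowed in AzureML experiment names with an underscore.
    return re.sub(r"[^A-Za-z0-9_-]", "_", value)
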
    def azure_config(self) -> AzureConfig:
        """
        Gets the AzureConfig instance that the script uses.
        """
        if self._azure_config is None:
            self._azure_config = AzureConfig.from_yaml(Path(self.train_yaml_path))
        return self._azure_config
Example 4
    def azure_config(self) -> AzureConfig:
        """
        Gets the AzureConfig instance that the script uses.
        """
        if self._azure_config is None:
            self._azure_config = AzureConfig.from_yaml(self.settings_yaml_file, project_root=self.project_root)
        return self._azure_config
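
Both azure_config properties above use the same lazy-initialization pattern: parse the settings file on first access, then reuse the cached instance. A self-contained sketch of that pattern; the class name, attribute names, and the AzureConfig import path are assumptions made for illustration:

from pathlib import Path
from typing import Optional

from InnerEye.Azure.azure_config import AzureConfig  # assumed import path


class ScriptWithAzureConfig:
    def __init__(self, settings_yaml_file: Path, project_root: Path) -> None:
        self.settings_yaml_file = settings_yaml_file
        self.project_root = project_root
        self._azure_config: Optional[AzureConfig] = None

    @property
    def azure_config(self) -> AzureConfig:
        # Parse the YAML settings only once; later accesses return the cached object.
        if self._azure_config is None:
            self._azure_config = AzureConfig.from_yaml(self.settings_yaml_file,
                                                       project_root=self.project_root)
        return self._azure_config
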
def report_structure_extremes(dataset_dir: str, yaml_file: str) -> None:
    """
    Writes structure-extreme lines for the subjects in a directory.
    If there are any structures with missing slices, a ValueError is raised after writing all the lines.
    This allows a build failure to be triggered when such structures exist.
    :param dataset_dir: Directory containing subject subdirectories with integer names.
    :param yaml_file: The path to the YAML file that contains all Azure-related options.
    """
    azure_config = AzureConfig.from_yaml(yaml_file_path=Path(yaml_file))
    download_dataset_directory(azure_config, dataset_dir)
    subjects: Set[int] = set()
    series_map = None
    institution_map = None
    for subj in os.listdir(dataset_dir):
        try:
            subjects.add(int(subj))
        except ValueError:
            if subj == "dataset.csv":
                # We should find this in every dataset_dir.
                series_map, institution_map = populate_series_maps(os.path.join(dataset_dir, subj))
            pass
    if institution_map is None or series_map is None:
        raise FileNotFoundError(f"Cannot find {dataset_dir}/dataset.csv")
    if not subjects:
        print(f"No subject directories found in {dataset_dir}")
        return
    print(f"Found {len(subjects)} subjects in {dataset_dir}")
    # You could temporarily edit subjects to be an explicit list of integers here, to process only certain subjects:
    # subjects = [23, 42, 99]
    full_output_dir = os.path.join(dataset_dir, "structure_extremes_full")
    os.makedirs(full_output_dir)
    problems_output_dir = os.path.join(dataset_dir, "structure_extremes_problems")
    os.makedirs(problems_output_dir)
    n_missing = 0
    files_created: Set[str] = set()
    for (index, subj_int) in enumerate(sorted(subjects), 1):
        subj = str(subj_int)
        institution_id = institution_map.get(subj, "")
        out = open_with_header(os.path.join(full_output_dir, institution_id + ".txt"), files_created)
        err = None
        for line in report_structure_extremes_for_subject(os.path.join(dataset_dir, subj), series_map[subj]):
            out.write(line + "\n")
            if line.find(MISSING_SLICE_MARKER) > 0:
                if err is None:
                    err = open_with_header(os.path.join(problems_output_dir, institution_id + ".txt"), files_created)
                err.write(line + "\n")
                n_missing += 1
        out.close()
        if err is not None:
            err.close()
        if index % 25 == 0:
            print(f"Processed {index} subjects")
    print(f"Processed all {len(subjects)} subjects")
    upload_to_dataset_directory(azure_config, dataset_dir, files_created)
    # If we found any structures with missing slices, raise an exception, which should be
    # uncaught where necessary to make any appropriate build step fail.
    if n_missing > 0:
        raise ValueError(f"Found {n_missing} structures with missing slices")
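
report_structure_extremes relies on the helper open_with_header, which is not shown here. Judging from how files_created is threaded through and later handed to upload_to_dataset_directory, it plausibly opens a per-institution file for appending, records the path, and writes a header only on first use. A hedged reconstruction; the header text and exact signature are assumptions:

from typing import IO, Set

STRUCTURE_EXTREMES_HEADER = "subject,structure,extremes"  # assumed header line


def open_with_header(path: str, files_created: Set[str]) -> IO[str]:
    # Open for appending; write the header only the first time this path is seen.
    is_first_use = path not in files_created
    files_created.add(path)
    handle = open(path, "a")
    if is_first_use:
        handle.write(STRUCTURE_EXTREMES_HEADER + "\n")
    return handle
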
    def azure_config(self) -> AzureConfig:
        """
        Gets the AzureConfig instance that the script uses. This will either return the value that has
        previously been set, or create a new AzureConfig object from the YAML file and project root settings
        that the present object holds.
        """
        if self._azure_config is None:
            self._azure_config = AzureConfig.from_yaml(
                self.settings_yaml_file, project_root=self.project_root)
        return self._azure_config
Example 7
def main(settings_yaml_file: Optional[Path] = None,
         project_root: Optional[Path] = None) -> None:
    """
    Main function.
    """
    logging_to_stdout()
    config = ReportStructureExtremesConfig.parse_args()
    azure_config = AzureConfig.from_yaml(yaml_file_path=settings_yaml_file or config.settings,
                                         project_root=project_root)
    report_structure_extremes(config.dataset, azure_config)
def get_most_recent_model() -> Model:
    most_recent_run = get_most_recent_run()
    azure_config = AzureConfig.from_yaml(
        fixed_paths.SETTINGS_YAML_FILE,
        project_root=fixed_paths.repository_root_directory())
    workspace = azure_config.get_workspace()
    run = fetch_run(workspace, most_recent_run)
    tags = run.get_tags()
    model_id = tags.get(MODEL_ID_KEY_NAME, None)
    assert model_id, f"No model_id tag was found on run {most_recent_run}"
    return Model(workspace=workspace, id=model_id)
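
The fetch_run helper used above is not shown. Assuming the stored run identifier has the form "experiment_name:run_id" (an assumption about how the recovery ID is encoded), a sketch could look like this:

from azureml.core import Experiment, Run, Workspace


def fetch_run_sketch(workspace: Workspace, run_recovery_id: str) -> Run:
    # Illustrative reconstruction, not the repository's implementation.
    experiment_name, run_id = run_recovery_id.split(":", 1)
    experiment = Experiment(workspace, experiment_name)
    return Run(experiment, run_id)
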
def main(args: Optional[List[str]] = None, project_root: Optional[Path] = None) -> None:
    """
    Main function.
    """
    logging_to_stdout()
    inference_config = SubmitForInferenceConfig.parse_args(args)
    settings = inference_config.settings or fixed_paths.SETTINGS_YAML_FILE
    azure_config = AzureConfig.from_yaml(settings, project_root=project_root)
    if inference_config.cluster:
        azure_config.cluster = inference_config.cluster
    submit_for_inference(inference_config, azure_config)
def main(settings_yaml_file: Optional[Path] = None,
         project_root: Optional[Path] = None) -> None:
    """
    Parses the commandline arguments, and based on those, starts the Tensorboard monitoring for the AzureML runs
    supplied on the commandline.
    :param settings_yaml_file: The YAML file that contains all information for accessing Azure.
    :param project_root: The root folder that contains all code for the present run. This is only used to locate
    a private settings file InnerEyePrivateSettings.yml.
    """
    monitor_config = AMLTensorBoardMonitorConfig.parse_args()
    settings_yaml_file = settings_yaml_file or monitor_config.settings
    monitor(monitor_config=monitor_config,
            azure_config=AzureConfig.from_yaml(settings_yaml_file, project_root=project_root))
Example 11
def submit_for_inference(args: SubmitForInferenceConfig) -> Optional[Path]:
    """
    Creates and submits an inference run to AzureML, and optionally downloads the resulting segmentation.
    :param args: Configuration; see SubmitForInferenceConfig.
    :return: Path to the downloaded segmentation on local disk, or None if no download folder was given.
    """
    logging.info(f"Building Azure configuration from {args.yaml_file}")
    azure_config = AzureConfig.from_yaml(args.yaml_file)
    logging.info("Getting workspace")
    workspace = azure_config.get_workspace()
    logging.info("Identifying model")
    model = Model(workspace=workspace, id=args.model_id)
    model_id = model.id
    logging.info(f"Identified model {model_id}")
    source_directory = tempfile.TemporaryDirectory()
    source_directory_name = source_directory.name
    logging.info(f"Building inference run submission in {source_directory_name}")
    source_directory_path = Path(source_directory_name)
    copy_image_file(args.image_file, source_directory_path / DEFAULT_DATA_FOLDER)
    # We also copy over run_scoring.py and score.py, in case the model we're using
    # does not have sufficiently recent versions of those files.
    for base in ["run_scoring.py", "score.py"]:
        shutil.copyfile(base, str(source_directory_path / base))
    source_config = SourceConfig(
        root_folder=source_directory_name,
        entry_script=str(source_directory_path / "run_scoring.py"),
        script_params={"--data-folder": ".", "--spawnprocess": "python",
                       "--model-id": model_id, "score.py": ""},
        conda_dependencies_files=download_conda_dependency_files(model, source_directory_path)
    )
    estimator = create_estimator_from_configs(workspace, azure_config, source_config, [])
    exp = Experiment(workspace=workspace, name=args.experiment_name)
    run = exp.submit(estimator)
    logging.info(f"Submitted run {run.id} in experiment {run.experiment.name}")
    logging.info(f"Run URL: {run.get_portal_url()}")
    if not args.keep_upload_folder:
        source_directory.cleanup()
        logging.info(f"Deleted submission directory {source_directory_name}")
    if args.download_folder is None:
        return None
    logging.info("Awaiting run completion")
    run.wait_for_completion()
    logging.info(f"Run has completed with status {run.get_status()}")
    download_path = choose_download_path(args.download_folder)
    logging.info(f"Attempting to download segmentation to {download_path}")
    run.download_file(DEFAULT_RESULT_IMAGE_NAME, str(download_path))
    if download_path.exists():
        logging.info(f"Downloaded segmentation to {download_path}")
    else:
        logging.warning("Segmentation NOT downloaded")
    return download_path
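
choose_download_path is referenced above but not shown. Since the earlier test expects the result to appear as DEFAULT_RESULT_IMAGE_NAME directly inside the download folder, a plausible sketch is a helper that picks a non-clashing file name in that folder; the numbering scheme below is an assumption:

from pathlib import Path


def choose_download_path_sketch(download_folder: Path) -> Path:
    # Start with the default result name; add a numeric prefix if that file already exists.
    candidate = download_folder / DEFAULT_RESULT_IMAGE_NAME
    index = 1
    while candidate.exists():
        candidate = download_folder / f"{index:03d}_{DEFAULT_RESULT_IMAGE_NAME}"
        index += 1
    return candidate
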
def get_most_recent_model_id(fallback_run_id_for_local_execution: str = FALLBACK_SINGLE_RUN) -> str:
    """
    Gets the string name of the most recently executed AzureML run, extracts which model that run had registered,
    and returns the model id.
    :param fallback_run_id_for_local_execution: A hardcoded AzureML run ID that is used when executing this code
    on a local box, outside of Azure build agents.
    """
    most_recent_run = get_most_recent_run_id(fallback_run_id_for_local_execution=fallback_run_id_for_local_execution)
    azure_config = AzureConfig.from_yaml(fixed_paths.SETTINGS_YAML_FILE,
                                         project_root=fixed_paths.repository_root_directory())
    run = azure_config.fetch_run(most_recent_run)
    assert run.status == "Completed", f"AzureML run {run.id} did not complete successfully."
    tags = run.get_tags()
    model_id = tags.get(MODEL_ID_KEY_NAME, None)
    assert model_id, f"No model_id tag was found on run {most_recent_run}"
    return model_id
Example 13
def test_get_comparison_data(test_output_dirs: OutputFolderForTests) -> None:
    """
    Check that metrics.csv and dataset.csv are created after the second epoch, if running on Azure.
    """
    most_recent_run = get_most_recent_run()
    azure_config = AzureConfig.from_yaml(
        fixed_paths.SETTINGS_YAML_FILE,
        project_root=fixed_paths.repository_root_directory())
    workspace = azure_config.get_workspace()
    run = fetch_run(workspace, most_recent_run)
    blob_path = get_epoch_results_path(2, ModelExecutionMode.TEST)
    (comparison_dataset_path,
     comparison_metrics_path) = get_comparison_baseline_paths(
         test_output_dirs.root_dir, blob_path, run, DATASET_CSV_FILE_NAME)
    assert comparison_dataset_path is not None
    assert comparison_metrics_path is not None
Example 14
def test_git_info_from_commandline() -> None:
    """
    Test if git branch information can be overridden on the command line.
    """
    azure_config = AzureConfig.from_yaml(fixed_paths.SETTINGS_YAML_FILE)
    azure_config.project_root = project_root
    azure_config.build_branch = "branch"
    azure_config.build_source_id = "id"
    azure_config.build_source_author = "author"
    azure_config.build_source_message = "message"
    azure_config.build_source_repository = "repo"
    source_info = azure_config.get_git_information()
    assert source_info.branch == "branch"
    assert source_info.commit_id == "id"
    assert source_info.commit_author == "author"
    assert source_info.commit_message == "message"
    assert source_info.repository == "repo"
Example 15
def get_most_recent_model(
        fallback_run_id_for_local_execution: str = FALLBACK_SINGLE_RUN
) -> Model:
    """
    Gets the string name of the most recently executed AzureML run, extracts which model that run had registered,
    and returns the instantiated model object.
    :param fallback_run_id_for_local_execution: A hardcoded AzureML run ID that is used when executing this code
    on a local box, outside of Azure build agents.
    """
    most_recent_run = get_most_recent_run_id(
        fallback_run_id_for_local_execution=fallback_run_id_for_local_execution
    )
    azure_config = AzureConfig.from_yaml(
        fixed_paths.SETTINGS_YAML_FILE,
        project_root=fixed_paths.repository_root_directory())
    run = azure_config.fetch_run(most_recent_run)
    tags = run.get_tags()
    model_id = tags.get(MODEL_ID_KEY_NAME, None)
    assert model_id, f"No model_id tag was found on run {most_recent_run}"
    return Model(workspace=azure_config.get_workspace(), id=model_id)
Example 16
def test_git_info() -> None:
    """
    Test if git branch information can be read correctly.
    """
    logging_to_stdout(log_level=logging.DEBUG)
    azure_config = AzureConfig.from_yaml(fixed_paths.SETTINGS_YAML_FILE)
    azure_config.project_root = project_root
    assert azure_config.build_branch == ""
    assert azure_config.build_source_id == ""
    assert azure_config.build_source_author == ""
    assert azure_config.build_source_message == ""
    assert azure_config.build_source_repository == ""
    source_info = azure_config.get_git_information()
    assert source_info.repository == azure_config.project_root.name
    # We can't access the branch name when this test runs on the build agents, because the repositories
    # are checked out in "detached head" state. Hence, can't assert on branch name in any way
    # that works locally and in the cloud.
    assert len(source_info.commit_id) == 40
    assert len(source_info.commit_author) > 0
    assert len(source_info.commit_message) > 0
Example 17
def test_download_and_upload(model_id: str,
                             test_output_dirs: OutputFolderForTests) -> None:
    """
    Test that a model can be downloaded from, and uploaded back to, a workspace.
    """
    azure_config = AzureConfig.from_yaml(
        yaml_file_path=fixed_paths.SETTINGS_YAML_FILE,
        project_root=fixed_paths.repository_root_directory())
    ws = azure_config.get_workspace()
    config_download = MoveModelConfig(model_id=model_id,
                                      path=str(test_output_dirs.root_dir),
                                      action="download")
    move(ws, config_download)
    assert (test_output_dirs.root_dir / model_id.replace(":", "_")).is_dir()
    config_upload = MoveModelConfig(model_id=model_id,
                                    path=str(test_output_dirs.root_dir),
                                    action="upload")
    model = move(ws, config_upload)
    assert model is not None
    assert PYTHON_ENVIRONMENT_NAME in model.tags
    assert model.description != ""
Example 18
def create_datafactory_and_run(files_and_tokens: Dict[str, str],
                               connection_string: str,
                               location: str,
                               is_unittest: bool = False) -> None:
    """
    Builds an Azure Data Factory that downloads the FastMRI dataset files from AWS and places them in Azure
    Blob Storage.
    :param files_and_tokens: A mapping from file name (like knee.tar.gz) to AWS access token.
    :param connection_string: The connection string of the Azure storage where the downloaded data should be stored.
    :param location: The Azure location in which the Data Factory should be created (for example, "westeurope").
    :param is_unittest: If True, download a small tar.gz file from GitHub. If False, download the "real" fastMRI
    data files from AWS.
    """

    azure_config = AzureConfig.from_yaml(
        yaml_file_path=fixed_paths.SETTINGS_YAML_FILE,
        project_root=fixed_paths.repository_root_directory())

    # The data factory name. It must be globally unique.
    data_factory_name = "fastmri-copy-data-" + uuid.uuid4().hex[:8]

    # Get either the Service Principal authentication, if those are set already, or use interactive auth in the browser
    azureid_auth = get_azure_auth(azure_config)

    # Create a data factory
    adf_client = DataFactoryManagementClient(azureid_auth,
                                             azure_config.subscription_id)
    df_resource = Factory(location=location)
    print(f"Creating data factory {data_factory_name}")
    df = adf_client.factories.create_or_update(azure_config.resource_group,
                                               data_factory_name, df_resource)
    while df.provisioning_state != 'Succeeded':
        df = adf_client.factories.get(azure_config.resource_group,
                                      data_factory_name)
        time.sleep(1)
    print("Data factory created")

    # Create a linked service pointing to where the downloads come from
    if is_unittest:
        http_service = LinkedServiceResource(properties=HttpLinkedService(
            url="https://github.com",
            enable_server_certificate_validation=True,
            authentication_type="Anonymous"))
    else:
        http_service = LinkedServiceResource(properties=HttpLinkedService(
            url="https://fastmri-dataset.s3.amazonaws.com/",
            enable_server_certificate_validation=True,
            authentication_type="Anonymous"))
    http_name = "AwsHttp"
    adf_client.linked_services.create_or_update(
        resource_group_name=azure_config.resource_group,
        factory_name=data_factory_name,
        linked_service_name=http_name,
        linked_service=http_service)
    # Create a linked service that represents the sink (Azure blob storage)
    blob_storage_name = "AzureBlob"
    blob_storage = AzureBlobStorageLinkedService(
        connection_string=SecureString(value=connection_string))
    blob_storage_service = LinkedServiceResource(properties=blob_storage)
    adf_client.linked_services.create_or_update(
        resource_group_name=azure_config.resource_group,
        factory_name=data_factory_name,
        linked_service_name=blob_storage_name,
        linked_service=blob_storage_service)

    linked_blob_storage = LinkedServiceReference(
        reference_name=blob_storage_name)
    linked_http = LinkedServiceReference(reference_name=http_name)

    def download_and_uncompress(source_file_or_tuple: Union[str, Tuple[str, str]],
                                target_folder: str) -> List[str]:
        """
        Downloads a file from AWS and stores it in blob storage in its compressed form.
        The compressed file in blob storage is then uncompressed and written to a new folder in blob storage.
        For example, if 'target_folder' is 'foo', the uncompressed file will be written to folder 'foo', and the
        compressed raw data will be written to 'foo_compressed'.
        :param source_file_or_tuple: The name of the .tar.gz or .tar file to download, without any access tokens.
        If the name is a Tuple[str, str], the second tuple element is the "real" extension, for files where the
        extension is misleading.
        :param target_folder: The folder prefix in the target storage account.
        :return: A list of pipelines that this method created.
        """
        if isinstance(source_file_or_tuple, str):
            source_file = source_file_or_tuple
            file_extension = "".join(Path(source_file).suffixes)
            correct_extension = file_extension
        elif isinstance(source_file_or_tuple, tuple):
            source_file, correct_extension = source_file_or_tuple
            file_extension = "".join(Path(source_file).suffixes)
        else:
            raise ValueError(
                f"Type of source_file_or_tuple not recognized: {type(source_file_or_tuple)}"
            )
        source_file_with_correct_extension = \
            source_file[:source_file.rfind(file_extension)] + correct_extension
        target_folder_compressed = target_folder + COMPRESSED_DATASET_SUFFIX
        if is_unittest:
            http_source = HttpServerLocation(
                relative_url="gulpjs/gulp/archive/v3.9.1.tar.gz")
        else:
            http_source = HttpServerLocation(
                relative_url=f"{source_file}{files_and_tokens[source_file]}")
        source_file_cleaned = source_file.replace(".", "_")
        # A dataset that reads the files from AWS as-is, no decompression
        source_compressed = BinaryDataset(linked_service_name=linked_http,
                                          location=http_source)
        source_compressed_name = f"{source_file_cleaned} on AWS"
        adf_client.datasets.create_or_update(
            resource_group_name=azure_config.resource_group,
            factory_name=data_factory_name,
            dataset_name=source_compressed_name,
            dataset=DatasetResource(properties=source_compressed))
        # The sink for downloading the datasets as-is (compressed)
        blob_storage_compressed = AzureBlobStorageLocation(
            file_name=source_file_with_correct_extension,
            container=TARGET_CONTAINER,
            folder_path=target_folder_compressed)
        dest_compressed = BinaryDataset(
            linked_service_name=linked_blob_storage,
            location=blob_storage_compressed)
        dest_compressed_name = f"{source_file_cleaned} on Azure"
        adf_client.datasets.create_or_update(
            resource_group_name=azure_config.resource_group,
            factory_name=data_factory_name,
            dataset_name=dest_compressed_name,
            dataset=DatasetResource(properties=dest_compressed))
        # A dataset that reads the files from blob storage and uncompresses on-the-fly
        if correct_extension == ".tar.gz":
            compression = DatasetTarGZipCompression()
            # By default, a folder gets created for each .tar.gz file that is read. Disable that.
            compression_properties = TarGZipReadSettings(
                preserve_compression_file_name_as_folder=False)
        elif correct_extension == ".tar":
            compression = DatasetTarCompression()
            # By default, a folder gets created for each .tar file that is read. Disable that.
            compression_properties = TarReadSettings(
                preserve_compression_file_name_as_folder=False)
        else:
            raise ValueError(
                f"Unable to determine compression for file {source_file}")
        source_uncompressed = BinaryDataset(
            linked_service_name=linked_blob_storage,
            location=blob_storage_compressed,
            compression=compression)
        source_uncompressed_name = f"read {source_file_cleaned} and uncompress"
        adf_client.datasets.create_or_update(
            resource_group_name=azure_config.resource_group,
            factory_name=data_factory_name,
            dataset_name=source_uncompressed_name,
            dataset=DatasetResource(properties=source_uncompressed))
        # The sink for downloading the datasets uncompressed
        final_dataset = BinaryDataset(linked_service_name=linked_blob_storage,
                                      location=AzureBlobStorageLocation(
                                          container=TARGET_CONTAINER,
                                          folder_path=target_folder))
        final_name = f"save {source_file_cleaned} uncompressed"
        adf_client.datasets.create_or_update(
            resource_group_name=azure_config.resource_group,
            factory_name=data_factory_name,
            dataset_name=final_name,
            dataset=DatasetResource(properties=final_dataset))
        # Copying from compressed source to compressed destination on blob storage
        download = CopyActivity(
            name=f"download {source_file_cleaned}",
            inputs=[DatasetReference(reference_name=source_compressed_name)],
            outputs=[DatasetReference(reference_name=dest_compressed_name)],
            source=HttpSource(),
            sink=BlobSink())
        # Read the compressed file from blob storage and create an uncompressed dataset.
        # This should not create extra folder structure beyond what is already in the tar file - this is specified
        # in compression_properties
        binary_source = BinarySource(format_settings=BinaryReadSettings(
            compression_properties=compression_properties))
        uncompress = CopyActivity(
            name=f"uncompress {source_file_cleaned}",
            inputs=[DatasetReference(reference_name=source_uncompressed_name)],
            outputs=[DatasetReference(reference_name=final_name)],
            source=binary_source,
            sink=BlobSink(),
            # Add a dependent activity: We first need to download
            depends_on=[
                ActivityDependency(activity=download.name,
                                   dependency_conditions=["Succeeded"])
            ])
        # Create a pipeline that first downloads from AWS to blob storage, and then decompresses from blob storage
        # to another blob storage location
        pipeline = f"{source_file_cleaned} to folder {target_folder}"
        adf_client.pipelines.create_or_update(
            resource_group_name=azure_config.resource_group,
            factory_name=data_factory_name,
            pipeline_name=pipeline,
            pipeline=PipelineResource(activities=[download, uncompress]))
        return [pipeline]

    file_list: FolderAndFileList = \
        [("antonsctest", ["foo.tar.gz", "bar.tar"])] if is_unittest else files_to_download
    all_pipelines = []
    print("Creating pipelines:")
    for target_folder, files in file_list:
        for file in files:
            pipelines = download_and_uncompress(file,
                                                target_folder=target_folder)
            for p in pipelines:
                print(f"Created pipeline {p}")
            all_pipelines.extend(pipelines)

    print("Starting all pipelines")
    run_ids_per_pipeline = {}
    for pipeline in all_pipelines:
        run_result = adf_client.pipelines.create_run(
            resource_group_name=azure_config.resource_group,
            factory_name=data_factory_name,
            pipeline_name=pipeline)
        print(f"Started pipeline: {pipeline}")
        run_ids_per_pipeline[run_result.run_id] = pipeline

    print("Waiting for pipelines to complete")
    status_per_run = {
        run_id: "running"
        for run_id in run_ids_per_pipeline.keys()
    }
    while True:
        for run_id in run_ids_per_pipeline.keys():
            if status_per_run[run_id]:
                pipeline_run = adf_client.pipeline_runs.get(
                    resource_group_name=azure_config.resource_group,
                    factory_name=data_factory_name,
                    run_id=run_id)
                status = pipeline_run.status
                if status == "Succeeded" or status == "Failed":
                    print(
                        f"Pipeline '{run_ids_per_pipeline[run_id]}' completed with status {status}"
                    )
                    status_per_run[run_id] = ""
                else:
                    status_per_run[run_id] = status
        remaining_runs = len([v for v in status_per_run.values() if v])
        print(f"Remaining pipelines that are running: {remaining_runs}")
        if remaining_runs == 0:
            break
        time.sleep(30)

    utcnow = datetime.now(timezone.utc)
    filter_params = RunFilterParameters(
        last_updated_after=utcnow - timedelta(days=1),
        last_updated_before=utcnow + timedelta(days=1))
    for run_id, pipeline in run_ids_per_pipeline.items():
        query_response = adf_client.activity_runs.query_by_pipeline_run(
            resource_group_name=azure_config.resource_group,
            factory_name=data_factory_name,
            run_id=run_id,
            filter_parameters=filter_params)
        run_status = query_response.value[0]
        print(f"Status for pipeline {pipeline}: {run_status.status}")
        if run_status.status == 'Succeeded':
            print(f"\tNumber of bytes read: {run_status.output['dataRead']}")
            print(
                f"\tNumber of bytes written: {run_status.output['dataWritten']}"
            )
            print(f"\tCopy duration: {run_status.output['copyDuration']}")
        else:
            print(f"\tErrors: {run_status.error['message']}")

    print("All pipelines completed. Deleting data factory.")
    adf_client.factories.delete(azure_config.resource_group, data_factory_name)
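
get_azure_auth is referenced above but not shown. One way to provide the credential, assuming the azure-identity package and that service principal details come from environment variables (the variable names and the tenant_id attribute on AzureConfig are assumptions):

import os

from azure.identity import ClientSecretCredential, InteractiveBrowserCredential


def get_azure_auth_sketch(azure_config: AzureConfig):
    # Prefer a service principal when one is configured in the environment, otherwise
    # fall back to interactive login in the browser, as the comment above describes.
    client_id = os.environ.get("APPLICATION_ID", "")
    client_secret = os.environ.get("APPLICATION_KEY", "")
    if client_id and client_secret:
        return ClientSecretCredential(tenant_id=azure_config.tenant_id,
                                      client_id=client_id,
                                      client_secret=client_secret)
    return InteractiveBrowserCredential()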