def get_default_azure_config() -> AzureConfig:
    """
    Gets the Azure-related configuration options, using the default settings file settings.yaml.
    """
    return AzureConfig.from_yaml(yaml_file_path=fixed_paths.SETTINGS_YAML_FILE,
                                 project_root=fixed_paths.repository_root_directory())

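# A minimal usage sketch for the helper above, not part of the test suite. The attributes
# printed here (subscription_id, resource_group) are standard AzureConfig fields that are
# read from settings.yaml via AzureConfig.from_yaml.
def example_show_default_config() -> None:
    azure_config = get_default_azure_config()
    print(f"Subscription ID: {azure_config.subscription_id}")
    print(f"Resource group: {azure_config.resource_group}")
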
def test_submit_for_inference(test_output_dirs: OutputFolderForTests) -> None:
    """
    Execute the submit_for_inference script on the model that was most recently trained.
    This starts an AzureML job and downloads the segmentation. Then check if the segmentation
    was actually produced.
    """
    model = get_most_recent_model()
    image_file = fixed_paths_for_tests.full_ml_test_data_path() / "train_and_test_data" / "id1_channel1.nii.gz"
    assert image_file.exists(), f"Image file not found: {image_file}"
    settings_file = fixed_paths.SETTINGS_YAML_FILE
    assert settings_file.exists(), f"Settings file not found: {settings_file}"
    azure_config = AzureConfig.from_yaml(settings_file, project_root=fixed_paths.repository_root_directory())
    # Read the name of the branch from the environment, so that the inference experiment is listed
    # alongside all other AzureML runs that belong to the current PR.
    build_branch = os.environ.get("BUILD_BRANCH", None)
    experiment_name = to_azure_friendly_string(build_branch) if build_branch else "model_inference"
    # Fetch git information; the return value is not needed in this test.
    azure_config.get_git_information()
    args = ["--image_file", str(image_file),
            "--model_id", model.id,
            "--settings", str(settings_file),
            "--download_folder", str(test_output_dirs.root_dir),
            "--cluster", "training-nc12",
            "--experiment", experiment_name]
    seg_path = test_output_dirs.root_dir / DEFAULT_RESULT_IMAGE_NAME
    assert not seg_path.exists(), f"Result file {seg_path} should not yet exist"
    submit_for_inference.main(args, project_root=fixed_paths.repository_root_directory())
    assert seg_path.exists(), f"Result file {seg_path} was not created"

def azure_config(self) -> AzureConfig:
    """
    Gets the AzureConfig instance that the script uses.
    """
    if self._azure_config is None:
        self._azure_config = AzureConfig.from_yaml(Path(self.train_yaml_path))
    return self._azure_config

def azure_config(self) -> AzureConfig:
    """
    Gets the AzureConfig instance that the script uses.
    """
    if self._azure_config is None:
        self._azure_config = AzureConfig.from_yaml(self.settings_yaml_file, project_root=self.project_root)
    return self._azure_config

def report_structure_extremes(dataset_dir: str, yaml_file: str) -> None:
    """
    Writes structure-extreme lines for the subjects in a directory.
    If there are any structures with missing slices, a ValueError is raised after writing all the
    lines. This allows a build failure to be triggered when such structures exist.
    :param dataset_dir: directory containing subject subdirectories with integer names.
    :param yaml_file: The path to the YAML file that contains all Azure-related options.
    """
    azure_config = AzureConfig.from_yaml(yaml_file_path=Path(yaml_file))
    download_dataset_directory(azure_config, dataset_dir)
    subjects: Set[int] = set()
    series_map = None
    institution_map = None
    for subj in os.listdir(dataset_dir):
        try:
            subjects.add(int(subj))
        except ValueError:
            if subj == "dataset.csv":
                # We should find this in every dataset_dir.
                series_map, institution_map = populate_series_maps(os.path.join(dataset_dir, subj))
    if institution_map is None or series_map is None:
        raise FileNotFoundError(f"Cannot find {dataset_dir}/dataset.csv")
    if not subjects:
        print(f"No subject directories found in {dataset_dir}")
        return
    print(f"Found {len(subjects)} subjects in {dataset_dir}")
    # You could temporarily edit subjects to be an explicit list of integers here, to process only
    # certain subjects:
    # subjects = [23, 42, 99]
    full_output_dir = os.path.join(dataset_dir, "structure_extremes_full")
    os.makedirs(full_output_dir)
    problems_output_dir = os.path.join(dataset_dir, "structure_extremes_problems")
    os.makedirs(problems_output_dir)
    n_missing = 0
    files_created: Set[str] = set()
    for (index, subj_int) in enumerate(sorted(subjects), 1):
        subj = str(subj_int)
        institution_id = institution_map.get(subj, "")
        out = open_with_header(os.path.join(full_output_dir, institution_id + ".txt"), files_created)
        err = None
        for line in report_structure_extremes_for_subject(os.path.join(dataset_dir, subj), series_map[subj]):
            out.write(line + "\n")
            if line.find(MISSING_SLICE_MARKER) > 0:
                if err is None:
                    err = open_with_header(os.path.join(problems_output_dir, institution_id + ".txt"),
                                           files_created)
                err.write(line + "\n")
                n_missing += 1
        out.close()
        if err is not None:
            err.close()
        if index % 25 == 0:
            print(f"Processed {index} subjects")
    print(f"Processed all {len(subjects)} subjects")
    upload_to_dataset_directory(azure_config, dataset_dir, files_created)
    # If we found any structures with missing slices, raise an exception, which should be left
    # uncaught where necessary to make any appropriate build step fail.
    if n_missing > 0:
        raise ValueError(f"Found {n_missing} structures with missing slices")

def azure_config(self) -> AzureConfig:
    """
    Gets the AzureConfig instance that the script uses. This will either read out a value that has
    previously been set, or create a new AzureConfig object from the YAML file and project root
    settings that the present object holds.
    """
    if self._azure_config is None:
        self._azure_config = AzureConfig.from_yaml(self.settings_yaml_file, project_root=self.project_root)
    return self._azure_config

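# The property above is a lazy-initialization cache: the AzureConfig is only built on first
# access and re-used afterwards. A self-contained sketch of the same pattern, assuming Path,
# Optional and AzureConfig are imported; the class name ExampleScript and its constructor are
# hypothetical:
class ExampleScript:
    def __init__(self, settings_yaml_file: Path, project_root: Path) -> None:
        self.settings_yaml_file = settings_yaml_file
        self.project_root = project_root
        self._azure_config: Optional[AzureConfig] = None

    @property
    def azure_config(self) -> AzureConfig:
        # Build the config from the YAML file only once, then cache it.
        if self._azure_config is None:
            self._azure_config = AzureConfig.from_yaml(self.settings_yaml_file,
                                                       project_root=self.project_root)
        return self._azure_config
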
def main(settings_yaml_file: Optional[Path] = None, project_root: Optional[Path] = None) -> None:
    """
    Main function: parses the commandline arguments and generates the structure extremes report.
    """
    logging_to_stdout()
    config = ReportStructureExtremesConfig.parse_args()
    azure_config = AzureConfig.from_yaml(yaml_file_path=settings_yaml_file or config.settings,
                                         project_root=project_root)
    report_structure_extremes(config.dataset, azure_config)

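# A hedged command-line sketch. The script name is a placeholder, and the flag names are
# inferred from the ReportStructureExtremesConfig fields used above (dataset, settings):
#
#   python report_structure_extremes.py --dataset /data/my_dataset --settings settings.yaml
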
def get_most_recent_model() -> Model:
    """
    Gets the AzureML model that was registered by the most recently executed run.
    """
    most_recent_run = get_most_recent_run()
    azure_config = AzureConfig.from_yaml(fixed_paths.SETTINGS_YAML_FILE,
                                         project_root=fixed_paths.repository_root_directory())
    workspace = azure_config.get_workspace()
    run = fetch_run(workspace, most_recent_run)
    tags = run.get_tags()
    model_id = tags.get(MODEL_ID_KEY_NAME, None)
    assert model_id, f"No model_id tag was found on run {most_recent_run}"
    return Model(workspace=workspace, id=model_id)

def main(args: Optional[List[str]] = None, project_root: Optional[Path] = None) -> None:
    """
    Main function: parses the commandline arguments and submits the inference run to AzureML.
    """
    logging_to_stdout()
    inference_config = SubmitForInferenceConfig.parse_args(args)
    settings = inference_config.settings or fixed_paths.SETTINGS_YAML_FILE
    azure_config = AzureConfig.from_yaml(settings, project_root=project_root)
    if inference_config.cluster:
        azure_config.cluster = inference_config.cluster
    submit_for_inference(inference_config, azure_config)

def main(settings_yaml_file: Optional[Path] = None, project_root: Optional[Path] = None) -> None:
    """
    Parses the commandline arguments, and based on those, starts the Tensorboard monitoring for the
    AzureML runs supplied on the commandline.
    :param settings_yaml_file: The YAML file that contains all information for accessing Azure.
    :param project_root: The root folder that contains all code for the present run. This is only
    used to locate a private settings file InnerEyePrivateSettings.yml.
    """
    monitor_config = AMLTensorBoardMonitorConfig.parse_args()
    settings_yaml_file = settings_yaml_file or monitor_config.settings
    monitor(monitor_config=monitor_config,
            azure_config=AzureConfig.from_yaml(settings_yaml_file, project_root=project_root))

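# A minimal programmatic-call sketch, assuming a settings.yaml at the current working directory.
# Any further options (such as which runs to monitor) are still read from sys.argv by
# AMLTensorBoardMonitorConfig.parse_args():
#
#   main(settings_yaml_file=Path("settings.yaml"),
#        project_root=Path.cwd())
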
def submit_for_inference(args: SubmitForInferenceConfig) -> Optional[Path]:
    """
    Create and submit an inference run to AzureML, and optionally download the resulting segmentation.
    :param args: configuration, see SubmitForInferenceConfig
    :return: path to the downloaded segmentation on local disk, or None if none was downloaded.
    """
    logging.info(f"Building Azure configuration from {args.yaml_file}")
    azure_config = AzureConfig.from_yaml(args.yaml_file)
    logging.info("Getting workspace")
    workspace = azure_config.get_workspace()
    logging.info("Identifying model")
    model = Model(workspace=workspace, id=args.model_id)
    model_id = model.id
    logging.info(f"Identified model {model_id}")
    source_directory = tempfile.TemporaryDirectory()
    source_directory_name = source_directory.name
    logging.info(f"Building inference run submission in {source_directory_name}")
    source_directory_path = Path(source_directory_name)
    copy_image_file(args.image_file, source_directory_path / DEFAULT_DATA_FOLDER)
    # We copy over run_scoring.py and score.py as well, in case the model we are using does not
    # have sufficiently recent versions of those files.
    for base in ["run_scoring.py", "score.py"]:
        shutil.copyfile(base, str(source_directory_path / base))
    source_config = SourceConfig(
        root_folder=source_directory_name,
        entry_script=str(source_directory_path / "run_scoring.py"),
        script_params={"--data-folder": ".", "--spawnprocess": "python",
                       "--model-id": model_id, "score.py": ""},
        conda_dependencies_files=download_conda_dependency_files(model, source_directory_path)
    )
    estimator = create_estimator_from_configs(workspace, azure_config, source_config, [])
    exp = Experiment(workspace=workspace, name=args.experiment_name)
    run = exp.submit(estimator)
    logging.info(f"Submitted run {run.id} in experiment {run.experiment.name}")
    logging.info(f"Run URL: {run.get_portal_url()}")
    if not args.keep_upload_folder:
        source_directory.cleanup()
        logging.info(f"Deleted submission directory {source_directory_name}")
    if args.download_folder is None:
        return None
    logging.info("Awaiting run completion")
    run.wait_for_completion()
    logging.info(f"Run has completed with status {run.get_status()}")
    download_path = choose_download_path(args.download_folder)
    logging.info(f"Attempting to download segmentation to {download_path}")
    run.download_file(DEFAULT_RESULT_IMAGE_NAME, str(download_path))
    if download_path.exists():
        logging.info(f"Downloaded segmentation to {download_path}")
    else:
        logging.warning("Segmentation NOT downloaded")
    return download_path

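# A hedged usage sketch for submit_for_inference. The keyword arguments mirror the fields that
# the function body reads (yaml_file, model_id, image_file, experiment_name, download_folder,
# keep_upload_folder); the concrete values are placeholders, and constructing the config directly
# like this, rather than via parse_args, is an assumption.
def example_submit_for_inference() -> Optional[Path]:
    config = SubmitForInferenceConfig(yaml_file=Path("settings.yaml"),
                                      model_id="my_model:1",
                                      image_file=Path("image.nii.gz"),
                                      experiment_name="model_inference",
                                      download_folder=Path("downloads"),
                                      keep_upload_folder=False)
    # Returns the local path of the downloaded segmentation, or None if no download was requested.
    return submit_for_inference(config)
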
def get_most_recent_model_id(fallback_run_id_for_local_execution: str = FALLBACK_SINGLE_RUN) -> str:
    """
    Gets the string name of the most recently executed AzureML run, extracts which model that run
    had registered, and returns the model id.
    :param fallback_run_id_for_local_execution: A hardcoded AzureML run ID that is used when executing
    this code on a local box, outside of Azure build agents.
    """
    most_recent_run = get_most_recent_run_id(fallback_run_id_for_local_execution=fallback_run_id_for_local_execution)
    azure_config = AzureConfig.from_yaml(fixed_paths.SETTINGS_YAML_FILE,
                                         project_root=fixed_paths.repository_root_directory())
    run = azure_config.fetch_run(most_recent_run)
    assert run.status == "Completed", f"AzureML run {run.id} did not complete successfully."
    tags = run.get_tags()
    model_id = tags.get(MODEL_ID_KEY_NAME, None)
    assert model_id, f"No model_id tag was found on run {most_recent_run}"
    return model_id

def test_get_comparison_data(test_output_dirs: OutputFolderForTests) -> None:
    """
    Check that metrics.csv and dataset.csv are created after the second epoch, if running on Azure.
    """
    most_recent_run = get_most_recent_run()
    azure_config = AzureConfig.from_yaml(fixed_paths.SETTINGS_YAML_FILE,
                                         project_root=fixed_paths.repository_root_directory())
    workspace = azure_config.get_workspace()
    run = fetch_run(workspace, most_recent_run)
    blob_path = get_epoch_results_path(2, ModelExecutionMode.TEST)
    (comparison_dataset_path, comparison_metrics_path) = get_comparison_baseline_paths(test_output_dirs.root_dir,
                                                                                       blob_path, run,
                                                                                       DATASET_CSV_FILE_NAME)
    assert comparison_dataset_path is not None
    assert comparison_metrics_path is not None

def test_git_info_from_commandline() -> None:
    """
    Test if git branch information can be overridden on the commandline.
    """
    azure_config = AzureConfig.from_yaml(fixed_paths.SETTINGS_YAML_FILE)
    azure_config.project_root = project_root
    azure_config.build_branch = "branch"
    azure_config.build_source_id = "id"
    azure_config.build_source_author = "author"
    azure_config.build_source_message = "message"
    azure_config.build_source_repository = "repo"
    source_info = azure_config.get_git_information()
    assert source_info.branch == "branch"
    assert source_info.commit_id == "id"
    assert source_info.commit_author == "author"
    assert source_info.commit_message == "message"
    assert source_info.repository == "repo"

def get_most_recent_model(fallback_run_id_for_local_execution: str = FALLBACK_SINGLE_RUN) -> Model:
    """
    Gets the string name of the most recently executed AzureML run, extracts which model that run
    had registered, and returns the instantiated model object.
    :param fallback_run_id_for_local_execution: A hardcoded AzureML run ID that is used when executing
    this code on a local box, outside of Azure build agents.
    """
    most_recent_run = get_most_recent_run_id(fallback_run_id_for_local_execution=fallback_run_id_for_local_execution)
    azure_config = AzureConfig.from_yaml(fixed_paths.SETTINGS_YAML_FILE,
                                         project_root=fixed_paths.repository_root_directory())
    run = azure_config.fetch_run(most_recent_run)
    tags = run.get_tags()
    model_id = tags.get(MODEL_ID_KEY_NAME, None)
    assert model_id, f"No model_id tag was found on run {most_recent_run}"
    return Model(workspace=azure_config.get_workspace(), id=model_id)

def test_git_info() -> None:
    """
    Test if git branch information can be read correctly.
    """
    logging_to_stdout(log_level=logging.DEBUG)
    azure_config = AzureConfig.from_yaml(fixed_paths.SETTINGS_YAML_FILE)
    azure_config.project_root = project_root
    assert azure_config.build_branch == ""
    assert azure_config.build_source_id == ""
    assert azure_config.build_source_author == ""
    assert azure_config.build_source_message == ""
    assert azure_config.build_source_repository == ""
    source_info = azure_config.get_git_information()
    assert source_info.repository == azure_config.project_root.name
    # We can't access the branch name when this test runs on the build agents, because the repositories
    # are checked out in "detached head" state. Hence, we can't assert on the branch name in any way
    # that works both locally and in the cloud.
    assert len(source_info.commit_id) == 40
    assert len(source_info.commit_author) > 0
    assert len(source_info.commit_message) > 0

def test_download_and_upload(model_id: str, test_output_dirs: OutputFolderForTests) -> None:
    """
    Test that downloads a model from a workspace and then uploads it again.
    """
    azure_config = AzureConfig.from_yaml(yaml_file_path=fixed_paths.SETTINGS_YAML_FILE,
                                         project_root=fixed_paths.repository_root_directory())
    ws = azure_config.get_workspace()
    config_download = MoveModelConfig(model_id=model_id, path=str(test_output_dirs.root_dir), action="download")
    move(ws, config_download)
    assert (test_output_dirs.root_dir / model_id.replace(":", "_")).is_dir()
    config_upload = MoveModelConfig(model_id=model_id, path=str(test_output_dirs.root_dir), action="upload")
    model = move(ws, config_upload)
    assert model is not None
    assert PYTHON_ENVIRONMENT_NAME in model.tags
    assert model.description != ""

def create_datafactory_and_run(files_and_tokens: Dict[str, str],
                               connection_string: str,
                               location: str,
                               is_unittest: bool = False) -> None:
    """
    Builds an Azure Data Factory to download the FastMRI dataset from AWS, and places the files in
    Azure Blob Storage.
    :param files_and_tokens: A mapping from file name (like knee.tar.gz) to AWS access token.
    :param connection_string: The connection string of the Azure storage where the downloaded data
    should be stored.
    :param location: The Azure location in which the Data Factory should be created (for example,
    "westeurope").
    :param is_unittest: If True, download a small tar.gz file from github. If False, download the
    "real" fastMRI datafiles from AWS.
    """
    azure_config = AzureConfig.from_yaml(yaml_file_path=fixed_paths.SETTINGS_YAML_FILE,
                                         project_root=fixed_paths.repository_root_directory())

    # The data factory name. It must be globally unique.
    data_factory_name = "fastmri-copy-data-" + uuid.uuid4().hex[:8]

    # Get the Service Principal authentication if it is already set up, otherwise use interactive
    # authentication in the browser.
    azureid_auth = get_azure_auth(azure_config)

    # Create a data factory
    adf_client = DataFactoryManagementClient(azureid_auth, azure_config.subscription_id)
    df_resource = Factory(location=location)
    print(f"Creating data factory {data_factory_name}")
    df = adf_client.factories.create_or_update(azure_config.resource_group, data_factory_name, df_resource)
    while df.provisioning_state != 'Succeeded':
        df = adf_client.factories.get(azure_config.resource_group, data_factory_name)
        time.sleep(1)
    print("Data factory created")

    # Create a linked service pointing to where the downloads come from
    if is_unittest:
        http_service = LinkedServiceResource(properties=HttpLinkedService(
            url="https://github.com",
            enable_server_certificate_validation=True,
            authentication_type="Anonymous"))
    else:
        http_service = LinkedServiceResource(properties=HttpLinkedService(
            url="https://fastmri-dataset.s3.amazonaws.com/",
            enable_server_certificate_validation=True,
            authentication_type="Anonymous"))
    http_name = "AwsHttp"
    adf_client.linked_services.create_or_update(resource_group_name=azure_config.resource_group,
                                                factory_name=data_factory_name,
                                                linked_service_name=http_name,
                                                linked_service=http_service)
    # Create a linked service that represents the sink (Azure blob storage)
    blob_storage_name = "AzureBlob"
    blob_storage = AzureBlobStorageLinkedService(connection_string=SecureString(value=connection_string))
    blob_storage_service = LinkedServiceResource(properties=blob_storage)
    adf_client.linked_services.create_or_update(resource_group_name=azure_config.resource_group,
                                                factory_name=data_factory_name,
                                                linked_service_name=blob_storage_name,
                                                linked_service=blob_storage_service)

    linked_blob_storage = LinkedServiceReference(reference_name=blob_storage_name)
    linked_http = LinkedServiceReference(reference_name=http_name)

    def download_and_uncompress(source_file_or_tuple: Union[str, Tuple[str, str]],
                                target_folder: str) -> List[str]:
        """
        Downloads a file from AWS and stores it in blob storage in its compressed form.
        From the compressed file in blob storage, it is then uncompressed, and written to a new
        folder in blob storage. For example, if 'target_folder' is 'foo', the uncompressed file
        will be written to folder 'foo', and the compressed raw data will be written to
        'foo_compressed'.
        :param source_file_or_tuple: The name of the .tar.gz or .tar file to download, without any
        access tokens. If the name is a Tuple[str, str], the second tuple element is the "real"
        extension, for files where the extension is misleading.
        :param target_folder: The folder prefix in the target storage account.
        :return: A list of pipelines that this method created.
        """
        if isinstance(source_file_or_tuple, str):
            source_file = source_file_or_tuple
            file_extension = "".join(Path(source_file).suffixes)
            correct_extension = file_extension
        elif isinstance(source_file_or_tuple, tuple):
            source_file, correct_extension = source_file_or_tuple
            file_extension = "".join(Path(source_file).suffixes)
        else:
            raise ValueError(f"Type of source_file_or_tuple not recognized: {type(source_file_or_tuple)}")
        source_file_with_correct_extension = source_file[:source_file.rfind(file_extension)] + correct_extension
        target_folder_compressed = target_folder + COMPRESSED_DATASET_SUFFIX
        if is_unittest:
            http_source = HttpServerLocation(relative_url="gulpjs/gulp/archive/v3.9.1.tar.gz")
        else:
            http_source = HttpServerLocation(relative_url=f"{source_file}{files_and_tokens[source_file]}")
        source_file_cleaned = source_file.replace(".", "_")
        # A dataset that reads the files from AWS as-is, no decompression
        source_compressed = BinaryDataset(linked_service_name=linked_http, location=http_source)
        source_compressed_name = f"{source_file_cleaned} on AWS"
        adf_client.datasets.create_or_update(resource_group_name=azure_config.resource_group,
                                             factory_name=data_factory_name,
                                             dataset_name=source_compressed_name,
                                             dataset=DatasetResource(properties=source_compressed))
        # The sink for downloading the datasets as-is (compressed)
        blob_storage_compressed = AzureBlobStorageLocation(file_name=source_file_with_correct_extension,
                                                           container=TARGET_CONTAINER,
                                                           folder_path=target_folder_compressed)
        dest_compressed = BinaryDataset(linked_service_name=linked_blob_storage,
                                        location=blob_storage_compressed)
        dest_compressed_name = f"{source_file_cleaned} on Azure"
        adf_client.datasets.create_or_update(resource_group_name=azure_config.resource_group,
                                             factory_name=data_factory_name,
                                             dataset_name=dest_compressed_name,
                                             dataset=DatasetResource(properties=dest_compressed))
        # A dataset that reads the files from blob storage and uncompresses on-the-fly
        if correct_extension == ".tar.gz":
            compression = DatasetTarGZipCompression()
            # By default, a folder gets created for each .tar.gz file that is read. Disable that.
            compression_properties = TarGZipReadSettings(preserve_compression_file_name_as_folder=False)
        elif correct_extension == ".tar":
            compression = DatasetTarCompression()
            # By default, a folder gets created for each .tar file that is read. Disable that.
            compression_properties = TarReadSettings(preserve_compression_file_name_as_folder=False)
        else:
            raise ValueError(f"Unable to determine compression for file {source_file}")
        source_uncompressed = BinaryDataset(linked_service_name=linked_blob_storage,
                                            location=blob_storage_compressed,
                                            compression=compression)
        source_uncompressed_name = f"read {source_file_cleaned} and uncompress"
        adf_client.datasets.create_or_update(resource_group_name=azure_config.resource_group,
                                             factory_name=data_factory_name,
                                             dataset_name=source_uncompressed_name,
                                             dataset=DatasetResource(properties=source_uncompressed))
        # The sink for downloading the datasets uncompressed
        final_dataset = BinaryDataset(linked_service_name=linked_blob_storage,
                                      location=AzureBlobStorageLocation(container=TARGET_CONTAINER,
                                                                        folder_path=target_folder))
        final_name = f"save {source_file_cleaned} uncompressed"
        adf_client.datasets.create_or_update(resource_group_name=azure_config.resource_group,
                                             factory_name=data_factory_name,
                                             dataset_name=final_name,
                                             dataset=DatasetResource(properties=final_dataset))
        # Copying from compressed source to compressed destination on blob storage
        download = CopyActivity(name=f"download {source_file_cleaned}",
                                inputs=[DatasetReference(reference_name=source_compressed_name)],
                                outputs=[DatasetReference(reference_name=dest_compressed_name)],
                                source=HttpSource(),
                                sink=BlobSink())
        # Read the compressed file from blob storage and create an uncompressed dataset.
        # This should not create extra folder structure beyond what is already in the tar file -
        # this is specified in compression_properties.
        binary_source = BinarySource(format_settings=BinaryReadSettings(
            compression_properties=compression_properties))
        uncompress = CopyActivity(name=f"uncompress {source_file_cleaned}",
                                  inputs=[DatasetReference(reference_name=source_uncompressed_name)],
                                  outputs=[DatasetReference(reference_name=final_name)],
                                  source=binary_source,
                                  sink=BlobSink(),
                                  # Add a dependent activity: we first need to download
                                  depends_on=[ActivityDependency(activity=download.name,
                                                                 dependency_conditions=["Succeeded"])])
        # Create a pipeline that first downloads from AWS to blob storage, and then decompresses
        # from blob storage to another blob storage location
        pipeline = f"{source_file_cleaned} to folder {target_folder}"
        adf_client.pipelines.create_or_update(resource_group_name=azure_config.resource_group,
                                              factory_name=data_factory_name,
                                              pipeline_name=pipeline,
                                              pipeline=PipelineResource(activities=[download, uncompress]))
        return [pipeline]

    file_list: FolderAndFileList = \
        [("antonsctest", ["foo.tar.gz", "bar.tar"])] if is_unittest else files_to_download
    all_pipelines = []
    print("Creating pipelines:")
    for target_folder, files in file_list:
        for file in files:
            pipelines = download_and_uncompress(file, target_folder=target_folder)
            for p in pipelines:
                print(f"Created pipeline {p}")
            all_pipelines.extend(pipelines)

    print("Starting all pipelines")
    run_ids_per_pipeline = {}
    for pipeline in all_pipelines:
        run_result = adf_client.pipelines.create_run(resource_group_name=azure_config.resource_group,
                                                     factory_name=data_factory_name,
                                                     pipeline_name=pipeline)
        print(f"Started pipeline: {pipeline}")
        run_ids_per_pipeline[run_result.run_id] = pipeline

    print("Waiting for pipelines to complete")
    status_per_run = {run_id: "running" for run_id in run_ids_per_pipeline.keys()}
    while True:
        for run_id in run_ids_per_pipeline.keys():
            if status_per_run[run_id]:
                pipeline_run = adf_client.pipeline_runs.get(resource_group_name=azure_config.resource_group,
                                                            factory_name=data_factory_name,
                                                            run_id=run_id)
                status = pipeline_run.status
                if status == "Succeeded" or status == "Failed":
                    print(f"Pipeline '{run_ids_per_pipeline[run_id]}' completed with status {status}")
                    status_per_run[run_id] = ""
                else:
                    status_per_run[run_id] = status
        remaining_runs = len([v for v in status_per_run.values() if v])
        print(f"Remaining pipelines that are running: {remaining_runs}")
        if remaining_runs == 0:
            break
        time.sleep(30)

    utcnow = datetime.now(timezone.utc)
    filter_params = RunFilterParameters(last_updated_after=utcnow - timedelta(days=1),
                                        last_updated_before=utcnow + timedelta(days=1))
    for run_id, pipeline in run_ids_per_pipeline.items():
        query_response = adf_client.activity_runs.query_by_pipeline_run(
            resource_group_name=azure_config.resource_group,
            factory_name=data_factory_name,
            run_id=run_id,
            filter_parameters=filter_params)
        run_status = query_response.value[0]
        print(f"Status for pipeline {pipeline}: {run_status.status}")
        if run_status.status == 'Succeeded':
            print(f"\tNumber of bytes read: {run_status.output['dataRead']}")
            print(f"\tNumber of bytes written: {run_status.output['dataWritten']}")
            print(f"\tCopy duration: {run_status.output['copyDuration']}")
        else:
            print(f"\tErrors: {run_status.error['message']}")

    print("All pipelines completed. Deleting data factory.")
    adf_client.factories.delete(azure_config.resource_group, data_factory_name)
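
# A hedged invocation sketch for create_datafactory_and_run. All values are placeholders:
# files_and_tokens maps each AWS file name (like knee.tar.gz, as per the docstring) to its
# signed query string, and connection_string points at the target Azure storage account.
#
#   create_datafactory_and_run(
#       files_and_tokens={"knee.tar.gz": "?AWSAccessKeyId=...&Signature=..."},
#       connection_string="DefaultEndpointsProtocol=https;AccountName=...;AccountKey=...",
#       location="westeurope")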