# is_ensemble is supplied via parametrize; without the decorator, pytest cannot run this test.
@pytest.mark.parametrize("is_ensemble", [True, False])
def test_download_checkpoints(test_output_dirs: OutputFolderForTests,
                              is_ensemble: bool,
                              runner_config: AzureConfig) -> None:
    output_dir = test_output_dirs.root_dir
    assert get_results_blob_path("some_run_id") == "azureml/ExperimentRun/dcid.some_run_id"
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(output_dir)
    # Any recent run ID from a PR build will do. Use a PR build because the checkpoint files are small there.
    runner_config.run_recovery_id = DEFAULT_ENSEMBLE_RUN_RECOVERY_ID if is_ensemble else DEFAULT_RUN_RECOVERY_ID
    run_recovery = RunRecovery.download_checkpoints_from_recovery_run(runner_config, config)
    run_to_recover = fetch_run(workspace=runner_config.get_workspace(),
                               run_recovery_id=runner_config.run_recovery_id)
    expected_checkpoint_file = "1" + CHECKPOINT_FILE_SUFFIX
    if is_ensemble:
        # For an ensemble, each cross validation child run contributes one checkpoint folder.
        child_runs = fetch_child_runs(run_to_recover)
        expected_files = [config.checkpoint_folder / OTHER_RUNS_SUBDIR_NAME
                          / str(x.get_tags()['cross_validation_split_index']) / expected_checkpoint_file
                          for x in child_runs]
    else:
        expected_files = [config.checkpoint_folder / run_to_recover.id / expected_checkpoint_file]
    checkpoint_paths = run_recovery.get_checkpoint_paths(1)
    if is_ensemble:
        assert len(run_recovery.checkpoints_roots) == len(expected_files)
        assert all(x in [y.parent for y in expected_files] for x in run_recovery.checkpoints_roots)
        assert len(checkpoint_paths) == len(expected_files)
        assert all(x in expected_files for x in checkpoint_paths)
    else:
        assert len(checkpoint_paths) == 1
        assert checkpoint_paths[0] == expected_files[0]
    assert all(expected_file.exists() for expected_file in expected_files)
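This test and test_download_checkpoints_hyperdrive_run below both rely on a runner_config pytest fixture that yields an AzureConfig able to reach the test AzureML workspace. A minimal sketch of such a fixture, assuming a hypothetical get_default_azure_config helper that loads the repository's default AzureML settings (the real fixture may configure more fields):

import pytest


@pytest.fixture
def runner_config() -> AzureConfig:
    """Sketch of the fixture these tests assume, not the repository's actual definition."""
    # get_default_azure_config is an assumed helper that reads the default AzureML settings.
    config = get_default_azure_config()
    # The tests only download checkpoints; they never train.
    config.train = False
    return config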
def discover_and_download_checkpoints_from_previous_runs(self) -> None:
    """
    Downloads checkpoints from a run recovery object or from a weights URL, and sets the
    checkpoint paths based on run_recovery_id, weights_url or local_weights_path.
    """
    if self.azure_config.run_recovery_id:
        self.run_recovery = RunRecovery.download_checkpoints_from_recovery_run(
            self.azure_config, self.model_config, self.run_context)
    else:
        self.run_recovery = None
    # The two sources are independent: recovered checkpoints and externally supplied weights
    # can both be present.
    if self.model_config.weights_url or self.model_config.local_weights_path:
        self.local_weights_path = self.get_and_save_modified_weights()
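As a quick illustration of the branching above, the following self-contained sketch mirrors the same source-selection logic with hypothetical stub configs (AzureConfigStub and ModelConfigStub are illustrative stand-ins, not InnerEye classes):

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class AzureConfigStub:
    run_recovery_id: Optional[str] = None


@dataclass
class ModelConfigStub:
    weights_url: Optional[str] = None
    local_weights_path: Optional[str] = None


def checkpoint_sources(azure: AzureConfigStub, model: ModelConfigStub) -> List[str]:
    """Returns the checkpoint sources that would be used, mirroring the method above."""
    sources = []
    if azure.run_recovery_id:
        sources.append("run_recovery")
    if model.weights_url or model.local_weights_path:
        sources.append("local_weights")
    return sources


# A run recovery ID and external weights are not mutually exclusive.
both = checkpoint_sources(AzureConfigStub("experiment:run_id"),
                          ModelConfigStub(weights_url="https://example.com/weights.pth"))
assert both == ["run_recovery", "local_weights"]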
def test_download_checkpoints_hyperdrive_run(test_output_dirs: OutputFolderForTests,
                                             runner_config: AzureConfig) -> None:
    output_dir = test_output_dirs.root_dir
    config = ModelConfigBase(should_validate=False)
    config.set_output_to(output_dir)
    runner_config.run_recovery_id = DEFAULT_ENSEMBLE_RUN_RECOVERY_ID
    child_runs = fetch_child_runs(run=fetch_run(runner_config.get_workspace(), DEFAULT_ENSEMBLE_RUN_RECOVERY_ID))
    expected_checkpoint_file = "1" + CHECKPOINT_FILE_SUFFIX
    # Recover each child run separately as well, to test the HyperDrive child run recovery functionality.
    for child in child_runs:
        expected_files = [config.checkpoint_folder / child.id / expected_checkpoint_file]
        run_recovery = RunRecovery.download_checkpoints_from_recovery_run(runner_config, config, child)
        assert all(x in expected_files for x in run_recovery.get_checkpoint_paths(epoch=1))
        assert all(expected_file.exists() for expected_file in expected_files)
def run(self) -> None:
    """
    Driver function to run an ML experiment. If an offline cross validation run is requested, then this
    function is recursively called for each cross validation split.
    """
    if self.is_offline_cross_val_parent_run():
        if self.model_config.is_segmentation_model:
            raise NotImplementedError("Offline cross validation is only supported for classification models.")
        self.spawn_offline_cross_val_classification_child_runs()
        return
    # Get the AzureML context in which the script is running
    if not self.model_config.is_offline_run and PARENT_RUN_CONTEXT is not None:
        logging.info("Setting tags from parent run.")
        self.set_run_tags_from_parent()
    self.save_build_info_for_dotnet_consumers()
    # Set data loader start method
    self.set_multiprocessing_start_method()
    # Configure the run recovery object, if a recovery ID was provided
    run_recovery: Optional[RunRecovery] = None
    if self.azure_config.run_recovery_id:
        run_recovery = RunRecovery.download_checkpoints_from_recovery_run(
            self.azure_config, self.model_config, RUN_CONTEXT)
    # Do training and inference, unless the "only register" switch is set (which requires a run_recovery
    # to be valid).
    if self.azure_config.register_model_only_for_epoch is None or run_recovery is None:
        # Set local_dataset to the mounted path specified in azure_runner.py, if any, or download it if
        # that fails and config.local_dataset was not already set.
        self.model_config.local_dataset = self.mount_or_download_dataset()
        self.model_config.write_args_file()
        logging.info(str(self.model_config))
        # Ensure that training runs are fully reproducible - setting random seeds alone is not enough!
        make_pytorch_reproducible()
        # Check for an existing dataset.csv file in the correct locations. Skip that if a dataset has
        # already been loaded (typically only during tests)
        if self.model_config.dataset_data_frame is None:
            assert self.model_config.local_dataset is not None
            ml_util.validate_dataset_paths(self.model_config.local_dataset)
        # Train a new model if required
        if self.azure_config.train:
            with logging_section("Model training"):
                model_train(self.model_config, run_recovery)
        else:
            self.model_config.write_dataset_files()
            self.create_activation_maps()
        # Log the number of epochs used for model training
        RUN_CONTEXT.log(name="Train epochs", value=self.model_config.num_epochs)
    # We specify the ModelProcessing as DEFAULT here even if the run_recovery points to an ensemble run,
    # because the current run is a single one. See the documentation of ModelProcessing for more details.
    best_epoch = self.run_inference_and_register_model(run_recovery, ModelProcessing.DEFAULT)
    # Generate report
    if best_epoch:
        Runner.generate_report(self.model_config, best_epoch, ModelProcessing.DEFAULT)
    elif self.model_config.is_scalar_model and len(self.model_config.get_test_epochs()) == 1:
        # We don't register scalar models, but still want to create a report if we have run inference.
        Runner.generate_report(self.model_config, self.model_config.get_test_epochs()[0],
                               ModelProcessing.DEFAULT)
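The guard on register_model_only_for_epoch above is easy to misread: training and dataset setup are skipped only when both an epoch to register is given and a run recovery object exists. A minimal, self-contained sketch of that condition (should_train_and_infer is an illustrative name, not an InnerEye function):

from typing import Optional


def should_train_and_infer(register_model_only_for_epoch: Optional[int],
                           has_run_recovery: bool) -> bool:
    # Mirrors the guard in run(): the "only register" path is taken only when an epoch
    # to register is given AND checkpoints were recovered from a previous run.
    return register_model_only_for_epoch is None or not has_run_recovery


assert should_train_and_infer(None, False)    # plain training run
assert should_train_and_infer(None, True)     # training resumed from a recovered run
assert should_train_and_infer(5, False)       # epoch given, but nothing recovered: still trains
assert not should_train_and_infer(5, True)    # register-only: skip training and dataset setup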