def run_in_situ(self) -> None:
    """
    Actually run the AzureML job; this method will typically run on an Azure VM.
    """
    # Only set the logging level now. Usually, when we set logging to DEBUG, we want diagnostics about the model
    # build itself, but not the tons of debug information that AzureML submissions create.
    logging_to_stdout(self.azure_config.log_level)
    suppress_logging_noise()
    pytest_failed = False
    training_failed = False
    pytest_passed = True
    # Ensure that model training and pytest both get executed in all cases, so that we see a full set of
    # test results in each PR.
    outputs_folder = self.model_config.outputs_folder
    try:
        logging_to_file(self.model_config.logs_folder / LOG_FILE_NAME)
        try:
            self.create_ml_runner().run()
        except Exception as ex:
            print_exception(ex, "Model training/testing failed.")
            training_failed = True
        if self.azure_config.pytest_mark:
            try:
                pytest_passed, results_file_path = run_pytest(self.azure_config.pytest_mark, outputs_folder)
                if not pytest_passed:
                    logging.error(f"Not all PyTest tests passed. See {results_file_path}")
            except Exception as ex:
                print_exception(ex, "Unable to run PyTest.")
                pytest_failed = True
    finally:
        # Wait for aggregation if required, and only if the training actually succeeded.
        if not training_failed and self.model_config.should_wait_for_other_cross_val_child_runs():
            self.wait_for_cross_val_runs_to_finish_and_aggregate()
        disable_logging_to_file()
    message = []
    if training_failed:
        message.append("Training failed")
    if pytest_failed:
        message.append("Unable to run Pytest")
    if not pytest_passed:
        message.append("At least 1 test in Pytest failed")
    # Terminate if pytest or model training has failed. This makes the smoke test in
    # PR builds fail if pytest fails.
    if message:
        raise ValueError(f"One component of the training pipeline failed: {'. '.join(message)}")
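
As a rough illustration of what the run_pytest helper used above might look like, here is a minimal sketch that wraps pytest.main. The helper name run_pytest_sketch and the "test-results.xml" file name are assumptions for this example, not the project's actual implementation.

# Illustrative sketch only: the helper name and the "test-results.xml" file name are
# assumptions; the real run_pytest in the repository may differ.
from pathlib import Path
from typing import Tuple

import pytest


def run_pytest_sketch(pytest_mark: str, outputs_folder: Path) -> Tuple[bool, Path]:
    """Run all tests carrying the given pytest mark and write a JUnit XML report."""
    outputs_folder.mkdir(parents=True, exist_ok=True)
    results_file = outputs_folder / "test-results.xml"
    # pytest.main returns exit code 0 when all selected tests passed.
    exit_code = pytest.main(["-m", pytest_mark, f"--junitxml={results_file}"])
    return exit_code == 0, results_file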
def test_logging_to_file(test_output_dirs: OutputFolderForTests) -> None:
    # Log file should go to a new, non-existent folder, 2 levels deep
    file_path = test_output_dirs.root_dir / "subdir1" / "subdir2" / "logfile.txt"
    common_util.logging_to_file_handler = None
    common_util.logging_to_file(file_path)
    assert common_util.logging_to_file_handler is not None
    log_line = "foo bar"
    logging.getLogger().setLevel(logging.INFO)
    logging.info(log_line)
    common_util.disable_logging_to_file()
    should_not_be_present = "This should not be present in logs"
    logging.info(should_not_be_present)
    assert common_util.logging_to_file_handler is None
    # Wait for a bit, tests sometimes fail with the file not existing yet
    time.sleep(2)
    assert file_path.exists()
    assert log_line in file_path.read_text()
    assert should_not_be_present not in file_path.read_text()
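
The test above pins down the observable behaviour of the file-logging helpers: a single module-level handler, parent folders created on demand, and logging that stops once the handler is removed. A minimal sketch consistent with that behaviour could look as follows; this is an assumption-based reconstruction, not the actual common_util implementation.

# Sketch only: mirrors the behaviour exercised by test_logging_to_file above;
# the real common_util helpers may differ in details such as the log format.
import logging
from pathlib import Path
from typing import Optional

logging_to_file_handler: Optional[logging.FileHandler] = None


def logging_to_file(file_path: Path) -> None:
    """Attach a file handler to the root logger, creating the log folder if needed."""
    global logging_to_file_handler
    file_path.parent.mkdir(parents=True, exist_ok=True)
    handler = logging.FileHandler(filename=str(file_path))
    handler.setLevel(logging.DEBUG)
    logging.getLogger().addHandler(handler)
    logging_to_file_handler = handler


def disable_logging_to_file() -> None:
    """Flush, close, and detach the file handler if one is active."""
    global logging_to_file_handler
    if logging_to_file_handler:
        logging_to_file_handler.close()
        logging.getLogger().removeHandler(logging_to_file_handler)
        logging_to_file_handler = None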
def run_in_situ(self) -> None:
    """
    Actually run the AzureML job; this method will typically run on an Azure VM.
    """
    # Only set the logging level now. Usually, when we set logging to DEBUG, we want diagnostics about the model
    # build itself, but not the tons of debug information that AzureML submissions create.
    logging_to_stdout(self.azure_config.log_level)
    suppress_logging_noise()
    error_messages = []
    # For the PR build in AzureML, we can either run pytest or train the simple PR model. Running both
    # only works when using DDP_spawn, but that has the side effect of messing up the memory consumption
    # of the large models.
    if self.azure_config.pytest_mark:
        try:
            outputs_folder = Path.cwd() / fixed_paths.DEFAULT_AML_UPLOAD_DIR
            pytest_passed, results_file_path = run_pytest(self.azure_config.pytest_mark, outputs_folder)
            if not pytest_passed:
                pytest_failures = f"Not all PyTest tests passed. See {results_file_path}"
                logging.error(pytest_failures)
                error_messages.append(pytest_failures)
        except Exception as ex:
            print_exception(ex, "Unable to run PyTest.")
            error_messages.append(f"Unable to run PyTest: {ex}")
    else:
        # Set environment variables for multi-node training if needed.
        # In particular, the multi-node environment variables should NOT be set in single-node
        # training, otherwise this might lead to errors with the c10 distributed backend
        # (https://github.com/microsoft/InnerEye-DeepLearning/issues/395)
        if self.azure_config.num_nodes > 1:
            set_environment_variables_for_multi_node()
        try:
            logging_to_file(self.model_config.logs_folder / LOG_FILE_NAME)
            try:
                self.create_ml_runner().run()
            except Exception as ex:
                print_exception(ex, "Model training/testing failed.")
                error_messages.append(f"Training failed: {ex}")
        finally:
            disable_logging_to_file()
    # Terminate if pytest or model training has failed. This makes the smoke test in
    # PR builds fail if pytest fails.
    if error_messages:
        raise ValueError(
            f"At least one component of the runner failed: {os.linesep} {os.linesep.join(error_messages)}")
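
For context on the set_environment_variables_for_multi_node call in the else branch above, the sketch below shows one way such a helper could translate scheduler-provided environment variables into the MASTER_ADDR/MASTER_PORT/NODE_RANK variables that PyTorch's c10 distributed backend and PyTorch Lightning expect. The source variable names (AZ_BATCHAI_MPI_MASTER_NODE, OMPI_COMM_WORLD_RANK) and the port number are assumptions for illustration only, not the project's actual helper.

# Illustrative sketch only: the AzureML/MPI source variable names and the port number
# are assumptions; the real set_environment_variables_for_multi_node may differ.
import logging
import os


def set_environment_variables_for_multi_node_sketch() -> None:
    """Propagate master-node and rank information for multi-node distributed training."""
    if "AZ_BATCHAI_MPI_MASTER_NODE" in os.environ:
        # Point all ranks at the master node chosen by the scheduler.
        os.environ["MASTER_ADDR"] = os.environ["AZ_BATCHAI_MPI_MASTER_NODE"]
    else:
        logging.info("No master node information found; assuming single-node training.")
        return
    os.environ.setdefault("MASTER_PORT", "6105")
    if "OMPI_COMM_WORLD_RANK" in os.environ:
        # PyTorch Lightning reads NODE_RANK to tell the participating nodes apart.
        os.environ["NODE_RANK"] = os.environ["OMPI_COMM_WORLD_RANK"]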