Example no. 1
 def run_in_situ(self) -> None:
     """
     Actually run the AzureML job; this method will typically run on an Azure VM.
     """
     # Only set the logging level now. Usually, when we set logging to DEBUG, we want diagnostics about the model
     # build itself, but not the tons of debug information that AzureML submissions create.
     logging_to_stdout(self.azure_config.log_level)
     suppress_logging_noise()
     pytest_failed = False
     training_failed = False
     pytest_passed = True
     # Ensure that model training and pytest both get executed in all cases, so that we see a full set of
     # test results in each PR.
     outputs_folder = self.model_config.outputs_folder
     try:
         logging_to_file(self.model_config.logs_folder / LOG_FILE_NAME)
         try:
             self.create_ml_runner().run()
         except Exception as ex:
             print_exception(ex, "Model training/testing failed.")
             training_failed = True
         if self.azure_config.pytest_mark:
             try:
                 pytest_passed, results_file_path = run_pytest(
                     self.azure_config.pytest_mark, outputs_folder)
                 if not pytest_passed:
                     logging.error(
                         f"Not all PyTest tests passed. See {results_file_path}"
                     )
             except Exception as ex:
                 print_exception(ex, "Unable to run PyTest.")
                 pytest_failed = True
     finally:
         # Wait for aggregation if required, and only if the training actually succeeded.
         if not training_failed and self.model_config.should_wait_for_other_cross_val_child_runs():
             self.wait_for_cross_val_runs_to_finish_and_aggregate()
         disable_logging_to_file()
     message = []
     if training_failed:
         message.append("Training failed")
     if pytest_failed:
         message.append("Unable to run Pytest")
     if not pytest_passed:
         message.append("At least 1 test in Pytest failed")
     # Terminate if pytest or model training has failed. This makes the smoke test in
     # PR builds fail if pytest fails.
     if message:
         raise ValueError(
             f"One component of the training pipeline failed: {'. '.join(message)}"
         )
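
run_pytest is used by all run_in_situ variants in these examples but is not shown here. A minimal sketch of such a wrapper, assuming it selects tests by marker via pytest.main and writes a JUnit XML report into the given output folder (the report file name below is a placeholder):

from pathlib import Path
from typing import Tuple

import pytest


def run_pytest(pytest_mark: str, outputs_folder: Path) -> Tuple[bool, Path]:
    """
    Runs all tests that carry the given pytest mark and writes a JUnit XML report into
    outputs_folder. Returns a tuple (all selected tests passed, path to the report file).
    """
    outputs_folder.mkdir(parents=True, exist_ok=True)
    # "pytest_results.xml" is a placeholder name; the real runner may use a different file.
    results_file_path = outputs_folder / "pytest_results.xml"
    # pytest.main returns an exit code; 0 means that all selected tests passed.
    status = pytest.main(["-m", pytest_mark, f"--junitxml={results_file_path}"])
    return status == 0, results_file_path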
Example no. 2
def test_logging_to_file(test_output_dirs: OutputFolderForTests) -> None:
    # Log file should go to a new, non-existent folder, 2 levels deep
    file_path = test_output_dirs.root_dir / "subdir1" / "subdir2" / "logfile.txt"
    common_util.logging_to_file_handler = None
    common_util.logging_to_file(file_path)
    assert common_util.logging_to_file_handler is not None
    log_line = "foo bar"
    logging.getLogger().setLevel(logging.INFO)
    logging.info(log_line)
    common_util.disable_logging_to_file()
    should_not_be_present = "This should not be present in logs"
    logging.info(should_not_be_present)
    assert common_util.logging_to_file_handler is None
    # Wait for a bit; tests sometimes fail because the file does not exist yet.
    time.sleep(2)
    assert file_path.exists()
    assert log_line in file_path.read_text()
    assert should_not_be_present not in file_path.read_text()
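
The test above pins down the contract of logging_to_file and disable_logging_to_file in common_util: a module-level logging_to_file_handler that is set while file logging is active, parent folders that are created on demand, and log records that stop going to the file once file logging is disabled. A minimal sketch of helpers with that behaviour, under those assumptions:

import logging
from pathlib import Path
from typing import Optional

# Module-level handle so that disable_logging_to_file (and tests) can find the active handler.
logging_to_file_handler: Optional[logging.FileHandler] = None


def logging_to_file(file_path: Path) -> None:
    """Creates the parent folders of file_path and attaches a file handler to the root logger."""
    global logging_to_file_handler
    file_path.parent.mkdir(parents=True, exist_ok=True)
    handler = logging.FileHandler(filename=str(file_path))
    handler.setLevel(logging.DEBUG)
    logging.getLogger().addHandler(handler)
    logging_to_file_handler = handler


def disable_logging_to_file() -> None:
    """Flushes and removes the file handler added by logging_to_file, if there is one."""
    global logging_to_file_handler
    if logging_to_file_handler is not None:
        logging_to_file_handler.close()
        logging.getLogger().removeHandler(logging_to_file_handler)
        logging_to_file_handler = None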
Example no. 3
 def run_in_situ(self) -> None:
     """
     Actually run the AzureML job; this method will typically run on an Azure VM.
     """
     # Only set the logging level now. Usually, when we set logging to DEBUG, we want diagnostics about the model
     # build itself, but not the tons of debug information that AzureML submissions create.
     logging_to_stdout(self.azure_config.log_level)
     suppress_logging_noise()
     error_messages = []
     # For the PR build in AzureML, we can either run pytest, or train the simple PR model. Running both
     # only works when using DDP_spawn, but that has the side effect of messing up memory consumption of the
     # large models.
     if self.azure_config.pytest_mark:
         try:
             outputs_folder = Path.cwd() / fixed_paths.DEFAULT_AML_UPLOAD_DIR
             pytest_passed, results_file_path = run_pytest(self.azure_config.pytest_mark, outputs_folder)
             if not pytest_passed:
                 pytest_failures = f"Not all PyTest tests passed. See {results_file_path}"
                 logging.error(pytest_failures)
                 error_messages.append(pytest_failures)
         except Exception as ex:
             print_exception(ex, "Unable to run PyTest.")
             error_messages.append(f"Unable to run PyTest: {ex}")
     else:
         # Set environment variables for multi-node training if needed.
         # In particular, the multi-node environment variables should NOT be set in single node
         # training, otherwise this might lead to errors with the c10 distributed backend
         # (https://github.com/microsoft/InnerEye-DeepLearning/issues/395)
         if self.azure_config.num_nodes > 1:
             set_environment_variables_for_multi_node()
         try:
             logging_to_file(self.model_config.logs_folder / LOG_FILE_NAME)
             try:
                 self.create_ml_runner().run()
             except Exception as ex:
                 print_exception(ex, "Model training/testing failed.")
                 error_messages.append(f"Training failed: {ex}")
         finally:
             disable_logging_to_file()
     # Terminate if pytest or model training has failed. This makes the smoke test in
     # PR builds fail if pytest fails.
     if error_messages:
         raise ValueError(
             f"At least one component of the runner failed: {os.linesep} {os.linesep.join(error_messages)}")
Example no. 4
 def run_in_situ(self) -> None:
     """
     Actually run the AzureML job; this method will typically run on an Azure VM.
     """
     # Only set the logging level now. Usually, when we set logging to DEBUG, we want diagnostics about the model
     # build itself, but not the tons of debug information that AzureML submissions create.
     # Suppress the logging from all processes but the one for GPU 0 on each node, to make log files more readable
     logging_to_stdout(self.azure_config.log_level if is_local_rank_zero() else "ERROR")
     suppress_logging_noise()
     if is_global_rank_zero():
         self.print_git_tags()
     # For the PR build in AzureML, we can either run pytest, or train the simple PR model. Running both
     # only works when using DDP_spawn, but that has the side effect of messing up memory consumption of the
     # large models.
     if self.azure_config.pytest_mark:
         outputs_folder = Path.cwd() / fixed_paths.DEFAULT_AML_UPLOAD_DIR
         pytest_passed, results_file_path = run_pytest(self.azure_config.pytest_mark, outputs_folder)
         if not pytest_passed:
             # Terminate if pytest has failed. This makes the smoke test in
             # PR builds fail if pytest fails.
             pytest_failures = f"Not all PyTest tests passed. See {results_file_path}"
             raise ValueError(pytest_failures)
     else:
         # Set environment variables for multi-node training if needed.
         # In particular, the multi-node environment variables should NOT be set in single node
         # training, otherwise this might lead to errors with the c10 distributed backend
         # (https://github.com/microsoft/InnerEye-DeepLearning/issues/395)
         if self.azure_config.num_nodes > 1:
             set_environment_variables_for_multi_node()
         ml_runner = self.create_ml_runner()
         ml_runner.setup()
         ml_runner.start_logging_to_file()
         try:
             ml_runner.run()
         finally:
             disable_logging_to_file()
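
Example no. 4 additionally restricts console logging and git-tag printing to a single process per node and per job via is_local_rank_zero and is_global_rank_zero. A minimal sketch, assuming the ranks are exposed through the LOCAL_RANK and GLOBAL_RANK/RANK environment variables that DDP launchers typically set, and treating unset variables as rank 0 (i.e. a single-process run):

import os


def is_local_rank_zero() -> bool:
    """True if this process is rank 0 on its node, or if no DDP environment is present."""
    return os.environ.get("LOCAL_RANK", "0") == "0"


def is_global_rank_zero() -> bool:
    """True if this process is rank 0 across all nodes, or if no DDP environment is present."""
    return os.environ.get("GLOBAL_RANK", os.environ.get("RANK", "0")) == "0"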