def run(config, logger=None, local_rank=0, **kwargs):
    """Launch distributed training with MLflow experiment logging.

    Sets up the NCCL process group, finalizes the py_config_runner
    configuration, logs the config/script files and run parameters to
    MLflow (rank 0 only), then runs ``training``. The run status
    ("OK"/"FAILED") is recorded in MLflow before the process group is
    torn down.

    Args:
        config: py_config_runner configuration object; must expose
            ``setup()`` plus ``config_filepath``/``script_filepath``
            (added automatically by py_config_runner).
        logger: logger used to report interrupts/failures.
            NOTE(review): defaults to ``None`` but is used unguarded in
            the exception handlers — callers are expected to pass one.
        local_rank: local process rank forwarded to ``training``.
        **kwargs: ignored; accepted for runner compatibility.

    Raises:
        Exception: any failure inside ``training`` is re-raised after
            being logged and after the process group is destroyed.
    """
    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "Nvidia/Amp requires cudnn backend to be enabled."

    dist.init_process_group("nccl", init_method="env://")

    # As we passed config with option --manual_config_load
    assert hasattr(config, "setup"), (
        "We need to manually setup the configuration, please set --manual_config_load "
        "to py_config_runner"
    )
    config = config.setup()
    assert_config(config, TRAINVAL_CONFIG)

    # The following attributes are automatically added by py_config_runner
    assert hasattr(config, "config_filepath") and isinstance(config.config_filepath, Path)
    assert hasattr(config, "script_filepath") and isinstance(config.script_filepath, Path)

    # dump python files to reproduce the run
    mlflow.log_artifact(config.config_filepath.as_posix())
    mlflow.log_artifact(config.script_filepath.as_posix())

    output_path = mlflow.get_artifact_uri()
    config.output_path = Path(output_path)

    # Log versions and run parameters from the master process only,
    # so MLflow does not receive duplicate params from every rank.
    if dist.get_rank() == 0:
        mlflow.log_params(
            {
                "pytorch version": torch.__version__,
                "ignite version": ignite.__version__,
            }
        )
        mlflow.log_params(get_params(config, TRAINVAL_CONFIG))

    try:
        training(
            config,
            local_rank=local_rank,
            with_mlflow_logging=True,
            with_plx_logging=False,
        )
    except KeyboardInterrupt:
        logger.info("Caught KeyboardInterrupt -> exit")
    except Exception:  # noqa
        logger.exception("")
        mlflow.log_param("Run Status", "FAILED")
        dist.destroy_process_group()
        # Bare raise preserves the original traceback (no re-chaining).
        raise
    mlflow.log_param("Run Status", "OK")
    dist.destroy_process_group()
def run(config, logger=None, local_rank=0, **kwargs):
    """Launch distributed training with Polyaxon experiment logging.

    Sets up the NCCL process group, finalizes the py_config_runner
    configuration, logs run parameters to the Polyaxon experiment
    (rank 0 only), then runs ``training``. Outputs go to the Polyaxon
    outputs path.

    Args:
        config: py_config_runner configuration object; must expose
            ``setup()`` plus ``config_filepath``/``script_filepath``
            (added automatically by py_config_runner).
        logger: logger used to report interrupts/failures.
            NOTE(review): defaults to ``None`` but is used unguarded in
            the exception handlers — callers are expected to pass one.
        local_rank: local process rank forwarded to ``training``.
        **kwargs: ignored; accepted for runner compatibility.

    Raises:
        Exception: any failure inside ``training`` is re-raised after
            being logged and after the process group is destroyed.
    """
    # Original assert message was `torch.cuda.is_available()` itself,
    # which would just print "False" — replaced with a useful message.
    assert torch.cuda.is_available(), "CUDA device is required"
    assert torch.backends.cudnn.enabled, "Nvidia/Amp requires cudnn backend to be enabled."

    dist.init_process_group("nccl", init_method="env://")

    # As we passed config with option --manual_config_load
    assert hasattr(config, "setup"), (
        "We need to manually setup the configuration, please set --manual_config_load "
        "to py_config_runner"
    )
    config = config.setup()
    assert_config(config, TRAINVAL_CONFIG)

    # The following attributes are automatically added by py_config_runner
    assert hasattr(config, "config_filepath") and isinstance(config.config_filepath, Path)
    assert hasattr(config, "script_filepath") and isinstance(config.script_filepath, Path)

    config.output_path = Path(get_outputs_path())

    # Log versions and run parameters from the master process only,
    # so Polyaxon does not receive duplicate params from every rank.
    if dist.get_rank() == 0:
        plx_exp = Experiment()
        plx_exp.log_params(
            **{
                "pytorch version": torch.__version__,
                "ignite version": ignite.__version__,
            }
        )
        plx_exp.log_params(**get_params(config, TRAINVAL_CONFIG))

    try:
        training(
            config,
            local_rank=local_rank,
            with_mlflow_logging=False,
            with_plx_logging=True,
        )
    except KeyboardInterrupt:
        logger.info("Caught KeyboardInterrupt -> exit")
    except Exception:  # noqa
        logger.exception("")
        dist.destroy_process_group()
        # Bare raise preserves the original traceback (no re-chaining).
        raise
    dist.destroy_process_group()
def run(config, logger=None, local_rank=0, **kwargs):
    """Launch distributed training with Trains/ClearML experiment logging.

    Initializes a Trains ``Task``, sets up the NCCL process group,
    finalizes the py_config_runner configuration, uploads the config
    and script files as task artifacts, logs run parameters (rank 0
    only), then runs ``training``.

    Args:
        config: py_config_runner configuration object; must expose
            ``setup()`` plus ``config_filepath``/``script_filepath``
            (added automatically by py_config_runner).
        logger: logger used to report interrupts/failures.
            NOTE(review): defaults to ``None`` but is used unguarded in
            the exception handlers — callers are expected to pass one.
        local_rank: local process rank forwarded to ``training``.
        **kwargs: ignored; accepted for runner compatibility.

    Raises:
        Exception: any failure inside ``training`` is re-raised after
            being logged and after the process group is destroyed.
    """
    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "Nvidia/Amp requires cudnn backend to be enabled."

    task = Task.init("ignite", "DeeplabV3_ResNet101 pascal_voc2012 segmentation example")

    dist.init_process_group("nccl", init_method="env://")

    # As we passed config with option --manual_config_load
    assert hasattr(config, "setup"), (
        "We need to manually setup the configuration, please set --manual_config_load "
        "to py_config_runner"
    )
    config = config.setup()
    assert_config(config, TRAINVAL_CONFIG)

    # The following attributes are automatically added by py_config_runner
    assert hasattr(config, "config_filepath") and isinstance(config.config_filepath, Path)
    assert hasattr(config, "script_filepath") and isinstance(config.script_filepath, Path)

    # dump python files to reproduce the run
    task.connect_configuration(config.config_filepath.as_posix())
    task.upload_artifact("script", config.script_filepath)

    config.output_path = Path("./artifacts")

    # log the configuration, if we are the master node
    if dist.get_rank() == 0:
        task.connect(get_params(config, TRAINVAL_CONFIG))

    try:
        training(config, local_rank=local_rank, with_trains_logging=True)
    except KeyboardInterrupt:
        logger.info("Caught KeyboardInterrupt -> exit")
    except Exception:  # noqa
        logger.exception("")
        dist.destroy_process_group()
        # Bare raise preserves the original traceback (no re-chaining).
        raise
    dist.destroy_process_group()