Example #1
# Imports assumed by this snippet (`training` is a project-local helper
# not shown here):
from pathlib import Path

import ignite
import mlflow
import torch
import torch.distributed as dist
from py_config_runner.config_utils import (TRAINVAL_CONFIG, assert_config,
                                           get_params)


def run(config, logger=None, local_rank=0, **kwargs):

    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "Nvidia/Amp requires cudnn backend to be enabled."

    dist.init_process_group("nccl", init_method="env://")

    # As the config was passed with the --manual_config_load option
    assert hasattr(config, "setup"), (
        "The configuration should be set up manually; please pass "
        "--manual_config_load to py_config_runner")

    config = config.setup()

    assert_config(config, TRAINVAL_CONFIG)
    # The following attributes are automatically added by py_config_runner
    assert hasattr(config, "config_filepath") and isinstance(
        config.config_filepath, Path)
    assert hasattr(config, "script_filepath") and isinstance(
        config.script_filepath, Path)

    # dump python files to reproduce the run
    mlflow.log_artifact(config.config_filepath.as_posix())
    mlflow.log_artifact(config.script_filepath.as_posix())

    output_path = mlflow.get_artifact_uri()
    config.output_path = Path(output_path)

    if dist.get_rank() == 0:
        mlflow.log_params({
            "pytorch version": torch.__version__,
            "ignite version": ignite.__version__,
        })
        mlflow.log_params(get_params(config, TRAINVAL_CONFIG))

    try:
        training(
            config,
            local_rank=local_rank,
            with_mlflow_logging=True,
            with_plx_logging=False,
        )
    except KeyboardInterrupt:
        logger.info("Catched KeyboardInterrupt -> exit")
    except Exception as e:  # noqa
        logger.exception("")
        mlflow.log_param("Run Status", "FAILED")
        dist.destroy_process_group()
        raise e

    mlflow.log_param("Run Status", "OK")
    dist.destroy_process_group()
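The two dist.destroy_process_group() calls above cover the failure and success paths separately. A behavior-equivalent sketch (not part of the original; it reuses the example's `training` helper and the same mlflow/dist imports) funnels status reporting and cleanup through a single try/except/finally:

def run_and_report(config, logger, local_rank=0):
    # "OK" is also reported after a KeyboardInterrupt, matching the original.
    status = "OK"
    try:
        training(
            config,
            local_rank=local_rank,
            with_mlflow_logging=True,
            with_plx_logging=False,
        )
    except KeyboardInterrupt:
        logger.info("Caught KeyboardInterrupt -> exit")
    except Exception:
        logger.exception("Training failed")
        status = "FAILED"
        raise
    finally:
        mlflow.log_param("Run Status", status)
        dist.destroy_process_group()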
Example #2
# Imports assumed by this snippet (`training` is a project-local helper
# not shown here; the tracking API is the pre-1.0 polyaxon-client):
from pathlib import Path

import ignite
import torch
import torch.distributed as dist
from polyaxon_client.tracking import Experiment, get_outputs_path
from py_config_runner.config_utils import (TRAINVAL_CONFIG, assert_config,
                                           get_params)


def run(config, logger=None, local_rank=0, **kwargs):

    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "Nvidia/Amp requires cudnn backend to be enabled."

    dist.init_process_group("nccl", init_method="env://")

    # As the config was passed with the --manual_config_load option
    assert hasattr(config, "setup"), (
        "The configuration should be set up manually; please pass "
        "--manual_config_load to py_config_runner")

    config = config.setup()

    assert_config(config, TRAINVAL_CONFIG)
    # The following attributes are automatically added by py_config_runner
    assert hasattr(config, "config_filepath") and isinstance(
        config.config_filepath, Path)
    assert hasattr(config, "script_filepath") and isinstance(
        config.script_filepath, Path)

    config.output_path = Path(get_outputs_path())

    if dist.get_rank() == 0:
        plx_exp = Experiment()
        plx_exp.log_params(
            **{
                "pytorch version": torch.__version__,
                "ignite version": ignite.__version__,
            })
        plx_exp.log_params(**get_params(config, TRAINVAL_CONFIG))

    try:
        training(config,
                 local_rank=local_rank,
                 with_mlflow_logging=False,
                 with_plx_logging=True)
    except KeyboardInterrupt:
        logger.info("Catched KeyboardInterrupt -> exit")
    except Exception as e:  # noqa
        logger.exception("")
        dist.destroy_process_group()
        raise e

    dist.destroy_process_group()
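All three examples gate parameter logging on the global rank, so values are recorded once per job rather than once per worker process. A minimal standalone form of that guard (illustrative names, not from the original):

import torch.distributed as dist


def log_params_once(log_fn, params):
    """Invoke `log_fn` with `params` on the rank-0 process only."""
    if dist.get_rank() == 0:
        log_fn(params)

Example #1 would pass mlflow.log_params here; Example #2 constructs its Experiment client inside the same guard so non-master workers never touch the tracking API.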
Example #3
# Imports assumed by this snippet (`training` is a project-local helper
# not shown here; `trains` is the SDK since renamed ClearML):
from pathlib import Path

import torch
import torch.distributed as dist
from py_config_runner.config_utils import (TRAINVAL_CONFIG, assert_config,
                                           get_params)
from trains import Task


def run(config, logger=None, local_rank=0, **kwargs):

    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "Nvidia/Amp requires cudnn backend to be enabled."

    task = Task.init(
        "ignite", "DeeplabV3_ResNet101 pascal_voc2012 segmentation example")

    dist.init_process_group("nccl", init_method="env://")

    # As the config was passed with the --manual_config_load option
    assert hasattr(config, "setup"), (
        "The configuration should be set up manually; please pass "
        "--manual_config_load to py_config_runner")

    config = config.setup()

    assert_config(config, TRAINVAL_CONFIG)
    # The following attributes are automatically added by py_config_runner
    assert hasattr(config, "config_filepath") and isinstance(
        config.config_filepath, Path)
    assert hasattr(config, "script_filepath") and isinstance(
        config.script_filepath, Path)

    # dump python files to reproduce the run
    task.connect_configuration(config.config_filepath.as_posix())
    task.upload_artifact("script", config.script_filepath)

    config.output_path = Path("./artifacts")

    # log the configuration, if we are the master node
    if dist.get_rank() == 0:
        task.connect(get_params(config, TRAINVAL_CONFIG))

    try:
        training(config, local_rank=local_rank, with_trains_logging=True)
    except KeyboardInterrupt:
        logger.info("Caught KeyboardInterrupt -> exit")
    except Exception:  # noqa
        logger.exception("Training failed")
        dist.destroy_process_group()
        raise

    dist.destroy_process_group()
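All three run variants call dist.init_process_group("nccl", init_method="env://"), so they expect MASTER_ADDR, MASTER_PORT, RANK, and WORLD_SIZE to be present in the environment; in the original examples a launcher such as torch.distributed.launch provides them along with local_rank. A minimal single-node sketch of that contract (illustrative only; 29500 is just the conventional default port):

import os

import torch
import torch.multiprocessing as mp


def _worker(local_rank, world_size):
    # env:// rendezvous: these variables must be set before run() calls
    # dist.init_process_group("nccl", init_method="env://").
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    os.environ["RANK"] = str(local_rank)
    os.environ["WORLD_SIZE"] = str(world_size)
    torch.cuda.set_device(local_rank)
    # run(config, logger=..., local_rank=local_rank) would go here; each
    # run() above creates and destroys the process group itself.


if __name__ == "__main__":
    n_gpus = torch.cuda.device_count()
    mp.spawn(_worker, args=(n_gpus,), nprocs=n_gpus)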