Example #1
    def testTBX(self):
        config = {
            "a": 2,
            "b": [1, 2],
            "c": {
                "c": {
                    "D": 123
                }
            },
            "int32": np.int32(1),
            "int64": np.int64(2),
            "bool8": np.bool8(True),
            "float32": np.float32(3),
            "float64": np.float64(4),
            "bad": np.float128(4),
        }
        t = Trial(
            evaluated_params=config, trial_id="tbx", logdir=self.test_dir)
        logger = TBXLoggerCallback()
        logger.on_trial_result(0, [], t, result(0, 4))
        logger.on_trial_result(1, [], t, result(1, 5))
        logger.on_trial_result(
            2, [], t, result(2, 6, score=[1, 2, 3], hello={"world": 1}))

        logger.on_trial_complete(3, [], t)

        self._validate_tbx_result(
            params=(b"float32", b"float64", b"int32", b"int64", b"bool8"),
            excluded_params=(b"bad", ))
Example #2
    def testBadTBX(self):
        config = {"b": (1, 2, 3)}
        t = Trial(
            evaluated_params=config, trial_id="tbx", logdir=self.test_dir)
        logger = TBXLoggerCallback()
        logger.on_trial_result(0, [], t, result(0, 4))
        logger.on_trial_result(1, [], t, result(1, 5))
        logger.on_trial_result(
            2, [], t, result(2, 6, score=[1, 2, 3], hello={"world": 1}))
        with self.assertLogs("ray.tune.logger", level="INFO") as cm:
            logger.on_trial_complete(3, [], t)
        assert "INFO" in cm.output[0]
Example #3
    def testTBX(self):
        config = {
            "a": 2,
            "b": [1, 2],
            "c": {
                "c": {
                    "D": 123
                }
            },
            "d": np.int64(1),
            "e": np.bool8(True)
        }
        t = Trial(evaluated_params=config,
                  trial_id="tbx",
                  logdir=self.test_dir)
        logger = TBXLoggerCallback()
        logger.on_trial_result(0, [], t, result(0, 4))
        logger.on_trial_result(1, [], t, result(1, 5))
        logger.on_trial_result(
            2, [], t, result(2, 6, score=[1, 2, 3], hello={"world": 1}))

        logger.on_trial_complete(3, [], t)

        self._validate_tbx_result()
Example #4
    from safemotions.envs.safe_motions_env import SafeMotionsEnv
    tune.register_env(env_name,
                      lambda config_args: SafeMotionsEnv(**config_args))

    ray.init(dashboard_host='0.0.0.0',
             include_dashboard=args.use_dashboard,
             ignore_reinit_error=True,
             logging_level=args.logging_level)
    config['callbacks'] = CustomTrainCallbacks

    if args.num_gpus is not None:
        config['num_gpus'] = args.num_gpus

    stop = {'time_total_s': args.time * 3600}

    experiment = {
        experiment_path: {
            'run': algorithm,
            'env': env_name,
            'stop': stop,
            'config': config,
            'checkpoint_freq': args.iterations_per_checkpoint,
            'checkpoint_at_end': True,
            'keep_checkpoints_num': 10,
            'max_failures': 0,
            'restore': checkpoint_path
        }
    }

    tune.run_experiments(experiment, callbacks=[TBXLoggerCallback()])
Example #5
def create_default_callbacks(callbacks: Optional[List[Callback]],
                             sync_config: SyncConfig,
                             loggers: Optional[List[Logger]],
                             metric: Optional[str] = None):
    """Create default callbacks for `tune.run()`.

    This function takes a list of existing callbacks and adds default
    callbacks to it.

    Specifically, three kinds of callbacks will be added:

    1. Loggers. Ray Tune's experiment analysis relies on CSV and JSON logging.
    2. Syncer. Ray Tune synchronizes logs and checkpoints between workers and
       the head node.
    3. Trial progress reporter. For reporting intermediate progress, like trial
       results, Ray Tune uses a callback.

    These callbacks will only be added if they don't already exist, i.e. if
    they haven't been passed (and configured) by the user. A notable case
    is when a Logger is passed that is not a CSV or JSON logger - then
    CSV and JSON loggers will still be created.

    Lastly, this function will ensure that the Syncer callback comes after all
    Logger callbacks, to ensure that the most up-to-date logs and checkpoints
    are synced across nodes.

    """
    callbacks = callbacks or []
    has_syncer_callback = False
    has_csv_logger = False
    has_json_logger = False
    has_tbx_logger = False

    has_trial_progress_callback = any(
        isinstance(c, TrialProgressCallback) for c in callbacks)

    if not has_trial_progress_callback:
        trial_progress_callback = TrialProgressCallback(metric=metric)
        callbacks.append(trial_progress_callback)

    # Track syncer obj/index to move callback after loggers
    last_logger_index = None
    syncer_index = None

    # Create LegacyLoggerCallback for passed Logger classes
    if loggers:
        # Todo(krfricke): Deprecate `loggers` argument, print warning here.
        # Add a warning as soon as we have ported all loggers to LoggerCallback
        # classes.
        add_loggers = []
        for trial_logger in loggers:
            if isinstance(trial_logger, LoggerCallback):
                callbacks.append(trial_logger)
            elif isinstance(trial_logger, type) and issubclass(
                    trial_logger, Logger):
                add_loggers.append(trial_logger)
            else:
                raise ValueError(
                    f"Invalid value passed to `loggers` argument of "
                    f"`tune.run()`: {trial_logger}")
        if add_loggers:
            callbacks.append(LegacyLoggerCallback(add_loggers))

    # Check if we have a CSV, JSON and TensorboardX logger
    for i, callback in enumerate(callbacks):
        if isinstance(callback, LegacyLoggerCallback):
            last_logger_index = i
            if CSVLogger in callback.logger_classes:
                has_csv_logger = True
            if JsonLogger in callback.logger_classes:
                has_json_logger = True
            if TBXLogger in callback.logger_classes:
                has_tbx_logger = True
        elif isinstance(callback, CSVLoggerCallback):
            has_csv_logger = True
            last_logger_index = i
        elif isinstance(callback, JsonLoggerCallback):
            has_json_logger = True
            last_logger_index = i
        elif isinstance(callback, TBXLoggerCallback):
            has_tbx_logger = True
            last_logger_index = i
        elif isinstance(callback, SyncerCallback):
            syncer_index = i
            has_syncer_callback = True

    # If CSV, JSON or TensorboardX loggers are missing, add
    if os.environ.get("TUNE_DISABLE_AUTO_CALLBACK_LOGGERS", "0") != "1":
        if not has_csv_logger:
            callbacks.append(CSVLoggerCallback())
            last_logger_index = len(callbacks) - 1
        if not has_json_logger:
            callbacks.append(JsonLoggerCallback())
            last_logger_index = len(callbacks) - 1
        if not has_tbx_logger:
            try:
                callbacks.append(TBXLoggerCallback())
                last_logger_index = len(callbacks) - 1
            except ImportError:
                logger.warning(
                    "The TensorboardX logger cannot be instantiated because "
                    "either TensorboardX or one of its dependencies is not "
                    "installed. Please make sure you have the latest version "
                    "of TensorboardX installed: `pip install -U tensorboardx`")

    # If no SyncerCallback was found, add
    if not has_syncer_callback and os.environ.get(
            "TUNE_DISABLE_AUTO_CALLBACK_SYNCER", "0") != "1":

        # Detect Docker and Kubernetes environments
        _sync_to_driver = detect_sync_to_driver(sync_config.sync_to_driver)

        syncer_callback = SyncerCallback(sync_function=_sync_to_driver)
        callbacks.append(syncer_callback)
        syncer_index = len(callbacks) - 1

    if syncer_index is not None and last_logger_index is not None and \
       syncer_index < last_logger_index:
        if (not has_csv_logger or not has_json_logger or not has_tbx_logger) \
           and not loggers:
            # Only raise the error if the loggers were passed by the user.
            # (I.e. don't raise if this was automatic behavior and they only
            # passed a custom SyncerCallback).
            raise ValueError(
                "The `SyncerCallback` you passed to `tune.run()` came before "
                "at least one `LoggerCallback`. Syncing should be done "
                "after writing logs. Please re-order the callbacks so that "
                "the `SyncerCallback` comes after any `LoggerCallback`.")
        else:
            # If these loggers were automatically created, just re-order
            # the callbacks.
            syncer_obj = callbacks[syncer_index]
            callbacks.pop(syncer_index)
            callbacks.insert(last_logger_index, syncer_obj)

    return callbacks
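
The docstring above promises that the SyncerCallback ends up after the last logger callback. As a minimal self-contained sketch (plain strings stand in for callback instances; this is an illustration, not Ray code), the re-ordering at the end of the function boils down to a pop-and-insert on the callback list:

# Toy illustration of the final re-ordering step in create_default_callbacks.
# The strings below are placeholders for callback objects.
callbacks = ["SyncerCallback", "CSVLoggerCallback", "JsonLoggerCallback", "TBXLoggerCallback"]

syncer_index = callbacks.index("SyncerCallback")  # 0: syncer currently comes first
last_logger_index = len(callbacks) - 1            # 3: index of the last logger

if syncer_index < last_logger_index:
    syncer_obj = callbacks.pop(syncer_index)
    callbacks.insert(last_logger_index, syncer_obj)

print(callbacks)
# ['CSVLoggerCallback', 'JsonLoggerCallback', 'TBXLoggerCallback', 'SyncerCallback']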
Example #6
from ray.air import session
from ray.air.config import RunConfig, ScalingConfig
from ray.train.torch import TorchTrainer
from ray.tune.integration.mlflow import MLflowLoggerCallback
from ray.tune.logger import TBXLoggerCallback


def train_func():
    for i in range(3):
        session.report(dict(epoch=i))


trainer = TorchTrainer(
    train_func,
    scaling_config=ScalingConfig(num_workers=2),
    run_config=RunConfig(
        callbacks=[
            MLflowLoggerCallback(experiment_name="train_experiment"),
            TBXLoggerCallback(),
        ],
    ),
)

# Run the training function, logging all the intermediate results
# to MLflow and Tensorboard.
result = trainer.fit()

# For MLflow logs:

# MLflow logs will by default be saved in an `mlflow` directory
# in the current working directory.

# $ cd mlflow
# # View the MLflow UI.
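
For the TensorBoard side, TBXLoggerCallback writes event files into the run's result directory. Assuming the default location under ~/ray_results (the exact layout depends on the Ray version and RunConfig), they can be inspected with the standard TensorBoard CLI:

# For TensorBoard logs (assumption: default result directory ~/ray_results):

# $ tensorboard --logdir ~/ray_results
# # Open the printed URL in a browser to browse the reported epochs.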
Example #7
def raytune(config, name, local, cpus, gpus, tune_result_dir, resume, ntrain,
            ntest, seeds):
    import ray
    from ray import tune
    from ray.tune.logger import TBXLoggerCallback
    from raytune.search_space import raytune_num_samples, search_space
    from raytune.utils import get_raytune_schedule, get_raytune_search_alg

    if seeds:
        # Set seeds for reproducibility
        random.seed(1234)
        np.random.seed(1234)
        tf.random.set_seed(1234)

    cfg = load_config(config)
    config_file_path = config

    if tune_result_dir is not None:
        os.environ["TUNE_RESULT_DIR"] = tune_result_dir
    else:
        if cfg["raytune"]["local_dir"] is None:
            raise TypeError(
                "Please specify a local_dir in the raytune section of the config file."
            )
        trd = cfg["raytune"]["local_dir"] + "/tune_result_dir"
        os.environ["TUNE_RESULT_DIR"] = trd

    expdir = Path(cfg["raytune"]["local_dir"]) / name
    expdir.mkdir(parents=True, exist_ok=True)
    shutil.copy(
        "mlpf/raytune/search_space.py",
        str(Path(cfg["raytune"]["local_dir"]) / name / "search_space.py"
            ))  # Copy the search space file to the train dir for later reference
    shutil.copy(config_file_path,
                str(Path(cfg["raytune"]["local_dir"]) / name / "config.yaml")
                )  # Copy the config file to the train dir for later reference

    ray.tune.ray_trial_executor.DEFAULT_GET_TIMEOUT = 1 * 60 * 60  # Avoid timeout errors
    if not local:
        ray.init(address="auto")

    sched = get_raytune_schedule(cfg["raytune"])
    search_alg = get_raytune_search_alg(cfg["raytune"], seeds)

    sync_config = tune.SyncConfig(sync_to_driver=False)

    start = datetime.now()
    analysis = tune.run(
        partial(build_model_and_train,
                full_config=config_file_path,
                ntrain=ntrain,
                ntest=ntest,
                name=name,
                seeds=seeds),
        config=search_space,
        resources_per_trial={
            "cpu": cpus,
            "gpu": gpus
        },
        name=name,
        scheduler=sched,
        search_alg=search_alg,
        num_samples=raytune_num_samples,
        local_dir=cfg["raytune"]["local_dir"],
        callbacks=[TBXLoggerCallback()],
        log_to_file=True,
        resume=resume,
        max_failures=2,
        sync_config=sync_config,
        stop=tune.stopper.MaximumIterationStopper(cfg["setup"]["num_epochs"]),
    )
    end = datetime.now()
    print("Total time of tune.run(...): {}".format(end - start))

    print(
        "Best hyperparameters found according to {} were: ".format(
            cfg["raytune"]["default_metric"]),
        analysis.get_best_config(cfg["raytune"]["default_metric"],
                                 cfg["raytune"]["default_mode"]),
    )

    skip = 20
    if skip > cfg["setup"]["num_epochs"]:
        skip = 0
    analysis.default_metric = cfg["raytune"]["default_metric"]
    analysis.default_mode = cfg["raytune"]["default_mode"]
    plot_ray_analysis(analysis, save=True, skip=skip)
    topk_summary_plot_v2(analysis,
                         k=5,
                         save_dir=Path(analysis.get_best_logdir()).parent)
    summarize_top_k(analysis,
                    k=5,
                    save_dir=Path(analysis.get_best_logdir()).parent)

    best_params = analysis.get_best_config(cfg["raytune"]["default_metric"],
                                           cfg["raytune"]["default_mode"])
    with open(
            Path(analysis.get_best_logdir()).parent / "best_parameters.txt",
            "a") as best_params_file:
        best_params_file.write("Best hyperparameters according to {}\n".format(
            cfg["raytune"]["default_metric"]))
        for key, val in best_params.items():
            best_params_file.write(("{}: {}\n".format(key, val)))

    with open(Path(analysis.get_best_logdir()).parent / "time.txt",
              "a") as timefile:
        timefile.write(str(end - start) + "\n")

    num_skipped = count_skipped_configurations(analysis.get_best_logdir())
    print("Number of skipped configurations: {}".format(num_skipped))
Example #8
def create_default_callbacks(callbacks: Optional[List[Callback]],
                             sync_config: SyncConfig,
                             loggers: Optional[List[Logger]]):

    callbacks = callbacks or []
    has_syncer_callback = False
    has_csv_logger = False
    has_json_logger = False
    has_tbx_logger = False

    # Track syncer obj/index to move callback after loggers
    last_logger_index = None
    syncer_index = None

    # Create LegacyLoggerCallback for passed Logger classes
    if loggers:
        # Todo(krfricke): Deprecate `loggers` argument, print warning here.
        # Add a warning as soon as we have ported all loggers to LoggerCallback
        # classes.
        add_loggers = []
        for trial_logger in loggers:
            if isinstance(trial_logger, LoggerCallback):
                callbacks.append(trial_logger)
            elif isinstance(trial_logger, type) and issubclass(
                    trial_logger, Logger):
                add_loggers.append(trial_logger)
            else:
                raise ValueError(
                    f"Invalid value passed to `loggers` argument of "
                    f"`tune.run()`: {trial_logger}")
        if add_loggers:
            callbacks.append(LegacyLoggerCallback(add_loggers))

    # Check if we have a CSV, JSON and TensorboardX logger
    for i, callback in enumerate(callbacks):
        if isinstance(callback, LegacyLoggerCallback):
            last_logger_index = i
            if CSVLogger in callback.logger_classes:
                has_csv_logger = True
            if JsonLogger in callback.logger_classes:
                has_json_logger = True
            if TBXLogger in callback.logger_classes:
                has_tbx_logger = True
        elif isinstance(callback, CSVLoggerCallback):
            has_csv_logger = True
            last_logger_index = i
        elif isinstance(callback, JsonLoggerCallback):
            has_json_logger = True
            last_logger_index = i
        elif isinstance(callback, TBXLoggerCallback):
            has_tbx_logger = True
            last_logger_index = i
        elif isinstance(callback, SyncerCallback):
            syncer_index = i
            has_syncer_callback = True

    # If CSV, JSON or TensorboardX loggers are missing, add
    if os.environ.get("TUNE_DISABLE_AUTO_CALLBACK_LOGGERS", "0") != "1":
        if not has_csv_logger:
            callbacks.append(CSVLoggerCallback())
            last_logger_index = len(callbacks) - 1
        if not has_json_logger:
            callbacks.append(JsonLoggerCallback())
            last_logger_index = len(callbacks) - 1
        if not has_tbx_logger:
            callbacks.append(TBXLoggerCallback())
            last_logger_index = len(callbacks) - 1

    # If no SyncerCallback was found, add
    if not has_syncer_callback and os.environ.get(
            "TUNE_DISABLE_AUTO_CALLBACK_SYNCER", "0") != "1":

        # Detect Docker and Kubernetes environments
        _sync_to_driver = detect_sync_to_driver(sync_config.sync_to_driver)

        syncer_callback = SyncerCallback(sync_function=_sync_to_driver)
        callbacks.append(syncer_callback)
        syncer_index = len(callbacks) - 1

    if syncer_index is not None and last_logger_index is not None and \
       syncer_index < last_logger_index:
        if (not has_csv_logger or not has_json_logger or not has_tbx_logger) \
           and not loggers:
            # Only raise the error if the loggers were passed by the user.
            # (I.e. don't raise if this was automatic behavior and they only
            # passed a custom SyncerCallback).
            raise ValueError(
                "The `SyncerCallback` you passed to `tune.run()` came before "
                "at least one `LoggerCallback`. Syncing should be done "
                "after writing logs. Please re-order the callbacks so that "
                "the `SyncerCallback` comes after any `LoggerCallback`.")
        else:
            # If these loggers were automatically created, just re-order
            # the callbacks.
            syncer_obj = callbacks[syncer_index]
            callbacks.pop(syncer_index)
            callbacks.insert(last_logger_index, syncer_obj)

    return callbacks
Example #9
    def tune_train(args,
                   model_class,
                   task_info: TaskInfo,
                   build_method=default_build_method,
                   model_kwargs: dict = None,
                   tune_config=None):
        if model_kwargs is None:
            model_kwargs = {}
        this_time = time.strftime("%m-%d_%H:%M:%S", time.localtime())
        experiment_name = f'{task_info.task_name}_{this_time}'

        if tune_config is None:
            config = {
                # 3e-4 for Small, 1e-4 for Base, 5e-5 for Large
                "lr":
                tune.loguniform(args.tune_min_lr, args.tune_max_lr),

                # -1 for disable, 0.8 for Base/Small, 0.9 for Large
                "layerwise_lr_decay_power":
                tune.choice([0.8, 0.9]),

                # lr scheduler
                "lr_scheduler":
                tune.choice([
                    'linear_schedule_with_warmup',
                    'polynomial_decay_schedule_with_warmup'
                ]),
            }
        else:
            config = tune_config
        if torch.cuda.is_available():
            resources_per_trial = {
                "cpu": args.tune_cpus_per_trial,
                "gpu": args.tune_gpus_per_trial
            }
        else:
            resources_per_trial = {"cpu": args.tune_cpus_per_trial}
        print("resources_per_trial", resources_per_trial)

        tune_dir = os.path.abspath('tune_lightning_logs')

        analysis = tune.run(
            tune.with_parameters(
                tune_train_once,
                args=args,
                task_info=task_info,
                model_class=model_class,
                build_method=build_method,
                model_kwargs=model_kwargs,
                resume=args.tune_resume,
                group=experiment_name,
                log_dir=tune_dir,
            ),
            mode="max",
            config=config,
            num_samples=args.tune_num_samples,
            metric=f'tune_{task_info.metric_name}',
            name=experiment_name,
            progress_reporter=CLIReporter(
                parameter_columns=list(config.keys()),
                metric_columns=[
                    "loss", f'tune_{task_info.metric_name}',
                    "training_iteration"
                ]),
            callbacks=[TBXLoggerCallback(),
                       CSVLoggerCallback()],
            resources_per_trial=resources_per_trial,
            scheduler=ASHAScheduler(
                max_t=args.max_epochs + 1,  # for test
                grace_period=args.min_epochs),
            queue_trials=True,
            keep_checkpoints_num=args.tune_keep_checkpoints_num,
            checkpoint_score_attr=f'tune_{task_info.metric_name}',
            local_dir=tune_dir,
        )
        print("Best hyperparameters found were: ", analysis.best_config)
        print("Best checkpoint: ", analysis.best_checkpoint)

        args_vars = vars(args)
        args_vars.update(analysis.best_config)
        model = model_class.load_from_checkpoint(os.path.join(
            analysis.best_checkpoint, "tune.ckpt"),
                                                 hparams=args,
                                                 **model_kwargs)

        pl_loggers = [
            loggers.CSVLogger(save_dir=tune.get_trial_dir(),
                              name="",
                              version="."),
            loggers.TensorBoardLogger(save_dir=tune.get_trial_dir(),
                                      name="",
                                      version=".",
                                      default_hp_metric=False),
        ]

        try:
            import wandb
            pl_loggers.append(
                loggers.WandbLogger(save_dir=tune_dir,
                                    project=args.project,
                                    name=tune.get_trial_name(),
                                    id=tune.get_trial_id(),
                                    offline=args.offline,
                                    group=experiment_name))
        except Exception:
            pass

        trainer: Trainer = Trainer.from_argparse_args(args, logger=pl_loggers)
        build_method(model, task_info)
        trainer.test(model)
Example #10
def create_default_callbacks(
    callbacks: Optional[List[Callback]],
    sync_config: SyncConfig,
    metric: Optional[str] = None,
):
    """Create default callbacks for `tune.run()`.

    This function takes a list of existing callbacks and adds default
    callbacks to it.

    Specifically, three kinds of callbacks will be added:

    1. Loggers. Ray Tune's experiment analysis relies on CSV and JSON logging.
    2. Syncer. Ray Tune synchronizes logs and checkpoints between workers and
       the head node.
    3. Trial progress reporter. For reporting intermediate progress, like trial
       results, Ray Tune uses a callback.

    These callbacks will only be added if they don't already exist, i.e. if
    they haven't been passed (and configured) by the user. A notable case
    is when a Logger is passed that is not a CSV or JSON logger - then
    CSV and JSON loggers will still be created.

    Lastly, this function will ensure that the Syncer callback comes after all
    Logger callbacks, to ensure that the most up-to-date logs and checkpoints
    are synced across nodes.

    """
    callbacks = callbacks or []
    has_syncer_callback = False
    has_csv_logger = False
    has_json_logger = False
    has_tbx_logger = False

    has_trial_progress_callback = any(
        isinstance(c, TrialProgressCallback) for c in callbacks)

    if not has_trial_progress_callback:
        trial_progress_callback = TrialProgressCallback(metric=metric)
        callbacks.append(trial_progress_callback)

    # Track syncer obj/index to move callback after loggers
    last_logger_index = None
    syncer_index = None

    # Check if we have a CSV, JSON and TensorboardX logger
    for i, callback in enumerate(callbacks):
        if isinstance(callback, LegacyLoggerCallback):
            if CSVLogger in callback.logger_classes:
                has_csv_logger = True
            if JsonLogger in callback.logger_classes:
                has_json_logger = True
            if TBXLogger in callback.logger_classes:
                has_tbx_logger = True
        elif isinstance(callback, CSVLoggerCallback):
            has_csv_logger = True
        elif isinstance(callback, JsonLoggerCallback):
            has_json_logger = True
        elif isinstance(callback, TBXLoggerCallback):
            has_tbx_logger = True
        elif isinstance(callback, SyncerCallback):
            syncer_index = i
            has_syncer_callback = True

        if isinstance(callback, LoggerCallback):
            last_logger_index = i

    # If CSV, JSON or TensorboardX loggers are missing, add
    if os.environ.get("TUNE_DISABLE_AUTO_CALLBACK_LOGGERS", "0") != "1":
        if not has_csv_logger:
            callbacks.append(CSVLoggerCallback())
            last_logger_index = len(callbacks) - 1
        if not has_json_logger:
            callbacks.append(JsonLoggerCallback())
            last_logger_index = len(callbacks) - 1
        if not has_tbx_logger:
            try:
                callbacks.append(TBXLoggerCallback())
                last_logger_index = len(callbacks) - 1
            except ImportError:
                logger.warning(
                    "The TensorboardX logger cannot be instantiated because "
                    "either TensorboardX or one of its dependencies is not "
                    "installed. Please make sure you have the latest version "
                    "of TensorboardX installed: `pip install -U tensorboardx`")

    # If no SyncerCallback was found, add
    if (not has_syncer_callback and
            os.environ.get("TUNE_DISABLE_AUTO_CALLBACK_SYNCER", "0") != "1"):
        syncer_callback = SyncerCallback(enabled=bool(sync_config.syncer),
                                         sync_period=sync_config.sync_period)
        callbacks.append(syncer_callback)
        syncer_index = len(callbacks) - 1

    if (syncer_index is not None and last_logger_index is not None
            and syncer_index < last_logger_index):
        # Re-order callbacks
        syncer_obj = callbacks[syncer_index]
        callbacks.pop(syncer_index)
        callbacks.insert(last_logger_index, syncer_obj)

    return callbacks
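
All variants of create_default_callbacks shown here gate the auto-added callbacks behind environment variables. As a hedged illustration (the variable names are taken directly from the code above; they must be set before the callbacks are created, e.g. before calling tune.run), disabling the automatic loggers and the syncer looks like this:

import os

# Skip the auto-added CSV/JSON/TBX logger callbacks
# (checked via TUNE_DISABLE_AUTO_CALLBACK_LOGGERS above).
os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1"

# Skip the auto-added SyncerCallback
# (checked via TUNE_DISABLE_AUTO_CALLBACK_SYNCER above).
os.environ["TUNE_DISABLE_AUTO_CALLBACK_SYNCER"] = "1"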