Example #1
def tune_example(num_workers=1, use_gpu=False, use_fp16=False,
                 test_mode=False):
    TorchTrainable = TorchTrainer.as_trainable(
        model_creator=ResNet18,
        data_creator=cifar_creator,
        optimizer_creator=optimizer_creator,
        loss_creator=nn.CrossEntropyLoss,
        scheduler_creator=scheduler_creator,
        initialization_hook=initialization_hook,
        num_workers=num_workers,
        config={
            "test_mode": test_mode,  # user-defined param to subset the data
            BATCH_SIZE: 128 * num_workers,
        },
        use_gpu=use_gpu,
        scheduler_step_freq="epoch",
        use_fp16=use_fp16)

    pbt_scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="val_loss",
        mode="min",
        perturbation_interval=1,
        hyperparam_mutations={
            # distribution for resampling
            "lr": lambda: np.random.uniform(0.001, 1),
            # allow perturbations within this set of categorical values
            "momentum": [0.8, 0.9, 0.99],
        })

    reporter = CLIReporter()
    reporter.add_metric_column("val_loss", "loss")
    reporter.add_metric_column("val_accuracy", "acc")

    analysis = tune.run(
        TorchTrainable,
        num_samples=4,
        config={
            "lr": tune.choice([0.001, 0.01, 0.1]),
            "momentum": 0.8
        },
        stop={"training_iteration": 2 if test_mode else 100},
        max_failures=3,  # used for fault tolerance
        checkpoint_freq=3,  # used for fault tolerance
        keep_checkpoints_num=1,  # used for fault tolerance
        verbose=2,
        progress_reporter=reporter,
        scheduler=pbt_scheduler)

    return analysis.get_best_config(metric="val_loss", mode="min")
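Example #1 leaves out its imports and the creator callables passed to TorchTrainer.as_trainable. The sketch below is one plausible set of definitions, not the original author's code: the CIFAR-10 transforms, the SGD optimizer, and the MultiStepLR schedule are assumptions; only the names (ResNet18, cifar_creator, optimizer_creator, scheduler_creator, initialization_hook) come from the call above.

import numpy as np
import torchvision
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader

from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import PopulationBasedTraining
from ray.util.sgd import TorchTrainer
from ray.util.sgd.utils import BATCH_SIZE


def ResNet18(config):
    # model_creator: build the network for each worker
    return torchvision.models.resnet18(num_classes=10)


def cifar_creator(config):
    # data_creator: return (train_loader, validation_loader)
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2470, 0.2435, 0.2616)),
    ])
    train_set = torchvision.datasets.CIFAR10(
        root="~/data", train=True, download=True, transform=transform)
    val_set = torchvision.datasets.CIFAR10(
        root="~/data", train=False, download=True, transform=transform)
    batch_size = config[BATCH_SIZE]
    return (DataLoader(train_set, batch_size=batch_size, shuffle=True),
            DataLoader(val_set, batch_size=batch_size))


def optimizer_creator(model, config):
    # optimizer_creator: "lr" and "momentum" come from the Tune config
    return optim.SGD(model.parameters(),
                     lr=config.get("lr", 0.1),
                     momentum=config.get("momentum", 0.9))


def scheduler_creator(optimizer, config):
    # scheduler_creator: stepped once per epoch (scheduler_step_freq="epoch")
    return optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30, 60])


def initialization_hook():
    # optional per-worker setup, e.g. NCCL environment variables
    pass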
Example #2
def main():
    config = Config(config_file="lunar/config/config_lunar.yaml").config

    ray.init(address="auto")

    space = {
        "agent_learn_every_x_steps":
        hp.choice("agent_learn_every_x_steps", [10, 20]),
        "replay_buffer_batch_size":
        hp.choice("replay_buffer_batch_size", [128, 512, 1024]),
        # "memory_learning_start": hp.choice("memory_learning_start", [50000]),
        "agent_gamma":
        hp.uniform("agent_gamma", 0.95, 0.999),
        "agent_gamma":
        hp.choice("agent_gamma", [0.95, 0.995])
    }

    reporter = CLIReporter()
    reporter.add_metric_column("mean_rewards")
    reporter.add_metric_column("reward")

    ahb = AsyncHyperBandScheduler(time_attr="training_iteration",
                                  metric="mean_rewards",
                                  mode="max",
                                  grace_period=500,
                                  max_t=3600)

    tune.run(
        LunarTrainer,
        name="asynHyber-lunar-ddpg",
        scheduler=ahb,
        config=config,
        queue_trials=True,
        num_samples=10,
        progress_reporter=reporter,
        resources_per_trial={
            "cpu": 3,
            "gpu": 0.2
        },
        search_alg=HyperOptSearch(space=space,
                                  max_concurrent=4,
                                  metric="mean_rewards",
                                  mode="max"),
        checkpoint_freq=20,
        checkpoint_at_end=True,
        verbose=1,
    )
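The run above assumes a LunarTrainer Trainable defined elsewhere in the project. What matters for the AsyncHyperBandScheduler and HyperOptSearch configured above is that each training step reports the "mean_rewards" key they optimize. The stand-in below only illustrates that contract; the trivial reward logic is an assumption, not the project's DDPG agent.

import random

from ray import tune


class LunarTrainer(tune.Trainable):
    def setup(self, config):
        # Hyperparameters sampled from the HyperOpt space arrive in `config`
        self.gamma = config.get("agent_gamma", 0.99)
        self.episode_rewards = []

    def step(self):
        # Placeholder for training one episode of the real agent
        self.episode_rewards.append(random.random())
        return {
            "reward": self.episode_rewards[-1],
            # Key targeted by the scheduler and the search algorithm above
            "mean_rewards": sum(self.episode_rewards) / len(self.episode_rewards),
        }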
Example #3
        use_fp16=args.fp16)

    pbt_scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="val_loss",
        mode="min",
        perturbation_interval=1,
        hyperparam_mutations={
            # distribution for resampling
            "lr": lambda: np.random.uniform(0.001, 1),
            # allow perturbations within this set of categorical values
            "momentum": [0.8, 0.9, 0.99],
        })

    reporter = CLIReporter()
    reporter.add_metric_column("val_loss", "loss")
    reporter.add_metric_column("val_accuracy", "acc")

    analysis = tune.run(
        TorchTrainable,
        num_samples=4,
        config={
            "lr": tune.choice([0.001, 0.01, 0.1]),
            "momentum": 0.8
        },
        stop={"training_iteration": 2 if args.smoke_test else 100},
        max_failures=3,  # used for fault tolerance
        checkpoint_freq=3,  # used for fault tolerance
        keep_checkpoints_num=1,  # used for fault tolerance
        verbose=2,
        progress_reporter=reporter,
        scheduler=pbt_scheduler)
Example #4
    Trainable = trainer.to_tune_trainable(train_func)
    pbt_scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="loss",
        mode="min",
        perturbation_interval=1,
        hyperparam_mutations={
            # distribution for resampling
            "lr": lambda: np.random.uniform(0.001, 1),
            # allow perturbations within this set of categorical values
            "momentum": [0.8, 0.9, 0.99],
        },
    )

    reporter = CLIReporter()
    reporter.add_metric_column("loss", "loss")

    analysis = tune.run(
        Trainable,
        num_samples=4,
        config={
            "lr": tune.choice([0.001, 0.01, 0.1]),
            "momentum": 0.8,
            "batch_size": 128 * args.num_workers,
            "epochs": args.num_epochs,
            "test_mode": args.smoke_test,  # whether to to subset the data
        },
        stop={"training_iteration": 2 if args.smoke_test else 100},
        max_failures=3,  # used for fault tolerance
        checkpoint_freq=3,  # used for fault tolerance
        keep_checkpoints_num=1,  # used for fault tolerance
        progress_reporter=reporter,
        scheduler=pbt_scheduler)
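The Example #4 fragment starts after train_func and trainer were created. A hedged sketch of what that earlier setup might look like with Ray Train's legacy Trainer.to_tune_trainable API follows; the training-loop body and the worker count are assumptions.

from ray import train
from ray.train import Trainer


def train_func(config):
    # Placeholder loop; reports the "loss" key that the PBT scheduler
    # above mutates "lr" and "momentum" against.
    for _ in range(config.get("epochs", 1)):
        train.report(loss=1.0)


trainer = Trainer(backend="torch", num_workers=2)  # num_workers=args.num_workers in the original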
Example #5
File: tune.py Project: vakker/cc19
def main(args):
    utils.init_random()

    exp_configs, tune_configs = utils.get_tune_configs(args.logdir)

    hparams = {}
    parameters = []
    for param_subset, params in tune_configs.items():
        hparams[param_subset] = []
        for param, options in params.items():
            parameters.append({'name': param, **options})
            hparams[param_subset].append(param)

    exp_configs['hparams'] = hparams

    exp_configs['data_params']['subset'] = args.subset
    exp_configs['data_params']['workers'] = args.ds_workers
    max_epochs = 2 if args.smoke else args.max_epochs
    num_samples = 2 if args.smoke else args.num_samples
    exp_configs.update({'num_gpus': 1})

    # ray.init()
    ray.init(memory=2000 * 1024 * 1024,
             object_store_memory=200 * 1024 * 1024,
             driver_object_store_memory=100 * 1024 * 1024)

    scheduler = AsyncHyperBandScheduler(time_attr="training_iteration",
                                        metric="val_accuracy",
                                        mode="max",
                                        grace_period=5,
                                        max_t=max(max_epochs, 5))
    client = AxClient(enforce_sequential_optimization=True)
    client.create_experiment(parameters=parameters,
                             objective_name='val_accuracy')
    search_alg = AxSearch(client, max_concurrent=1, mode='max')
    # search_alg = ConcurrencyLimiter(search_alg, max_concurrent=2)
    reporter = CLIReporter()
    reporter.add_metric_column("val_accuracy")
    reporter.add_metric_column("train_loss")
    trainable = TorchTrainer.as_trainable(
        model_creator=utils.model_creator,
        data_creator=utils.data_creator,
        optimizer_creator=utils.optimizer_creator,
        loss_creator=utils.loss_creator,
        scheduler_creator=utils.scheduler_creator,
        scheduler_step_freq="epoch",
        use_gpu=True,
        config={BATCH_SIZE: exp_configs['batch_size']},
        num_workers=args.workers)
    analysis = tune.run(trainable,
                        num_samples=num_samples,
                        config=exp_configs,
                        trial_name_creator=utils.trial_str_creator,
                        progress_reporter=reporter,
                        scheduler=scheduler,
                        search_alg=search_alg,
                        stop={"training_iteration": max_epochs},
                        local_dir=args.logdir,
                        checkpoint_freq=10,
                        checkpoint_at_end=True,
                        keep_checkpoints_num=3,
                        resume=args.resume,
                        checkpoint_score_attr='val_accuracy',
                        max_failures=2,
                        verbose=1)
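Once the run in Example #5 completes, the returned analysis object can be queried for the best Ax-driven trial. The follow-up below is a sketch that would sit at the end of main(args); the metric and mode simply mirror the scheduler settings above.

    best_config = analysis.get_best_config(metric="val_accuracy", mode="max")
    best_logdir = analysis.get_best_logdir(metric="val_accuracy", mode="max")
    print("Best hyperparameters:", best_config)
    print("Best trial logdir:", best_logdir)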
Example #6
def tune_learner_on_stream(learner, learner_name, task_level_tuning,
                           stream, redis_address, local_mode, num_hp_samplings,
                           vis_params, exp_dir, seed, **training_params):
    """
    Returns 2 dataframes:
     - The first one contains information about the best trajectory and
     contains as many rows as there are tasks. Each row corresponding to the
     model trained on the corresponding task in the best trajectory.
      - The second contains one row per hyper-parameters combination. Each
      Row corresponds contains information about the results on all tasks for
      this specific hp combination. Note that, *in the task-level hp optim
      settting*, this DF is useful to investigate the behaviors of specific
      trainings, but rows *DOES NOT* correspond to actual trajectories.
    """

    exp_name = os.path.basename(exp_dir)
    init_path = path.join(exp_dir, 'model_initializations', learner_name)
    torch.save(learner, init_path)
    config = {**learner.get_search_space(),
              'training-params': training_params,
              'tasks': stream,
              'vis_params': vis_params,
              # 'learner': learner,
              'learner_path': init_path,
              'task_level_tuning': task_level_tuning,
              # 'env': learner_name
              'seed': seed
              }


    def trial_name_creator(trial):
        return learner_name
        # return '{}_{}'.format(learner_name, trial.trial_id)

    reporter = CLIReporter(max_progress_rows=10)
    # reporter.add_metric_column('avg_acc_val')
    reporter.add_metric_column('avg_acc_val_so_far', 'avg_val')
    reporter.add_metric_column('avg_acc_test_so_far', 'avg_test')
    reporter.add_metric_column('total_params')
    # reporter.add_metric_column('fw_t')
    # reporter.add_metric_column('data_t')
    # reporter.add_metric_column('eval_t')
    # reporter.add_metric_column('epoch_t')
    reporter.add_metric_column('duration_model_creation', 'creat_t')
    reporter.add_metric_column('duration_training', 'train_t')
    reporter.add_metric_column('duration_postproc', 'pp_t')
    reporter.add_metric_column('duration_finish', 'fin_t')
    reporter.add_metric_column('duration_eval', 'ev_t')
    reporter.add_metric_column('duration_sum', 'sum_t')
    reporter.add_metric_column('duration_seconds', 'step_t')
    reporter.add_metric_column('total_t')
    reporter.add_metric_column('t')

    ray_params = dict(
        loggers=[JsonLogger, CSVLogger],
        name=learner_name,
        resources_per_trial=learner.ray_resources,
        num_samples=num_hp_samplings,
        local_dir=exp_dir,
        verbose=1,
        progress_reporter=reporter,
        trial_name_creator=trial_name_creator,
        max_failures=3,
    )
    envs = []
    all_val_accs = defaultdict(list)
    all_test_accs = defaultdict(list)
    if task_level_tuning:
        best_trials_df = []
        config['ray_params'] = ray_params
        config['local_mode'] = local_mode
        config['redis_address'] = redis_address
        analysis, selected = train_on_tasks(config)
        for t_id, (task, task_an) in enumerate(zip(stream, analysis)):
            # envs.append([])
            for trial_n, t in enumerate(task_an.trials):
                if len(envs) <= trial_n:
                    envs.append([])
                env = '{}_Trial_{}_{}_{}'.format(exp_name, t, t.experiment_tag,
                                                 task['descriptor'])
                envs[trial_n].append(env)
                if selected[t_id] == t.experiment_tag:
                    all_val_accs[t.experiment_tag].append(
                        '<span style="font-weight:bold">{}</span>'.format(
                        t.last_result[f'Val_T{t_id}']))
                else:
                    all_val_accs[t.experiment_tag].append(
                        t.last_result[f'Val_T{t_id}'])
                all_test_accs[t.experiment_tag].append(
                    t.last_result[f'Test_T{t_id}']
                )

            best_trial = max(
                task_an.trials,
                key=lambda trial: trial.last_result['avg_acc_val_so_far'])

            df = task_an.trial_dataframes[best_trial.logdir]
            best_trials_df.append(df)

        return_df = pandas.concat(best_trials_df, ignore_index=True)
        analysis = analysis[-1]
        results = sorted(analysis.trials, reverse=True,
                         key=lambda trial: trial.last_result['avg_acc_val_so_far'])
    else:
        if not ray.is_initialized():
            if local_mode:
                ray.init(local_mode=local_mode)
            else:
                ray.init(redis_address)
                # logging_level=logging.DEBUG)
        ray_params['config'] = config
        analysis = tune.run(train_on_tasks, **ray_params)

        results = sorted(analysis.trials, reverse=True,
                         key=lambda trial: trial.last_result['avg_acc_val_so_far'])
        for t in results:
            envs.append([])
            for task in stream:
                env = '{}_Trial_{}_{}_{}'.format(exp_name, t, t.experiment_tag,
                                                 task['descriptor'])
                envs[-1].append(env)
        return_df = analysis.trial_dataframes[results[0].logdir]
    summary = {
        'model': [t.experiment_tag for t in results],
        'Avg acc Val': [t.last_result['avg_acc_val'] for t in results],
        'Acc Val': [all_val_accs[t.experiment_tag] for t in results],
        'Avg acc Test': [t.last_result['avg_acc_test'] for t in results],
        'Acc Test': [all_test_accs[t.experiment_tag] for t in results],
        'Params': [t.last_result['total_params'] for t in results],
        'Steps': [t.last_result['total_steps'] for t in results],
        'paths': [t.logdir for t in results],
        'evaluated_params': [t.evaluated_params for t in results],
        'envs': envs
    }
    summary = pandas.DataFrame(summary)

    return return_df, summary
Example #7
import os

from ray import tune
from ray.tune.schedulers import MedianStoppingRule
from hyperopt import hp
from ray.tune.suggest.hyperopt import HyperOptSearch
from ray.tune import CLIReporter

from some_model_to_train import SomeModelToTrain

MODEL_FILENAME = "checkpoint.pth"
TUNE_RESULTS_FOLDER = './ray_results/'
MAX_TRAINING_ITERATION = 2000

reporter = CLIReporter(max_progress_rows=10)
reporter.add_metric_column("mean_reward")


class Trainable(tune.Trainable):
    def setup(self, hyperparameter):
        self.someModelToTrain = SomeModelToTrain(hyperparameter)

    def step(self):
        mean_reward = self.someModelToTrain.train_one_episode()
        return {'mean_reward': mean_reward}

    def save_checkpoint(self, tmp_checkpoint_dir):
        checkpoint_path = os.path.join(tmp_checkpoint_dir, MODEL_FILENAME)
        self.someModelToTrain.save(checkpoint_path)
        return tmp_checkpoint_dir
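    # --- The snippet above is cut off here. What follows is a hedged sketch, not
    # --- part of the original file: a load_checkpoint counterpart (assuming
    # --- SomeModelToTrain exposes a .load() mirroring .save()) and one possible
    # --- way to launch the trainable with the modules imported above (the
    # --- hyperparameter space and budgets are assumptions).
    def load_checkpoint(self, tmp_checkpoint_dir):
        checkpoint_path = os.path.join(tmp_checkpoint_dir, MODEL_FILENAME)
        self.someModelToTrain.load(checkpoint_path)


if __name__ == "__main__":
    space = {"learning_rate": hp.loguniform("learning_rate", -5, -2)}
    tune.run(
        Trainable,
        stop={"training_iteration": MAX_TRAINING_ITERATION},
        search_alg=HyperOptSearch(space, metric="mean_reward", mode="max"),
        scheduler=MedianStoppingRule(metric="mean_reward", mode="max"),
        progress_reporter=reporter,
        num_samples=10,
        checkpoint_freq=50,
        local_dir=TUNE_RESULTS_FOLDER,
    )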
Example #8
def train(
        trainer,
        config,
        stop,
        exp_name,
        num_seeds=1,
        num_gpus=0,
        test_mode=False,
        suffix="",
        checkpoint_freq=10,
        keep_checkpoints_num=None,
        start_seed=0,
        local_mode=False,
        save_pkl=True,
        custom_callback=None,
        **kwargs
):
    # initialize ray
    if not os.environ.get("redis_password"):
        initialize_ray(test_mode=test_mode, local_mode=local_mode, num_gpus=num_gpus)
    else:
        password = os.environ.get("redis_password")
        assert os.environ.get("ip_head")
        print(
            "Detected redis_password ({}) in the environment, so a Ray "
            "cluster will be started!".format(password)
        )
        if num_gpus:
            print(
                "Running in cluster mode, so the GPU specification here is "
                "disabled and should be done when submitting the task to the "
                "cluster. You are requesting {} GPU(s) per machine!".format(num_gpus)
            )
        initialize_ray(address=os.environ["ip_head"], test_mode=test_mode, redis_password=password)

    # prepare config
    used_config = {
        "seed": tune.grid_search([i * 100 + start_seed for i in range(num_seeds)]),
        "log_level": "DEBUG" if test_mode else "INFO",
        "callbacks": custom_callback if custom_callback else DrivingCallbacks,  # Must Have!
    }
    if config:
        used_config.update(config)
    config = copy.deepcopy(used_config)

    trainer_name = trainer if isinstance(trainer, str) else trainer._name

    if not isinstance(stop, dict) and stop is not None:
        assert np.isscalar(stop)
        stop = {"timesteps_total": int(stop)}

    if keep_checkpoints_num is not None and not test_mode:
        assert isinstance(keep_checkpoints_num, int)
        kwargs["keep_checkpoints_num"] = keep_checkpoints_num
        kwargs["checkpoint_score_attr"] = "episode_reward_mean"

    if "verbose" not in kwargs:
        kwargs["verbose"] = 1 if not test_mode else 2

    # This functionality is not supported yet!
    metric_columns = CLIReporter.DEFAULT_COLUMNS.copy()
    progress_reporter = CLIReporter(metric_columns)
    progress_reporter.add_metric_column("success")
    progress_reporter.add_metric_column("crash")
    progress_reporter.add_metric_column("out")
    progress_reporter.add_metric_column("max_step")
    progress_reporter.add_metric_column("length")
    kwargs["progress_reporter"] = progress_reporter

    # start training
    analysis = tune.run(
        trainer,
        name=exp_name,
        checkpoint_freq=checkpoint_freq,
        checkpoint_at_end=True,
        stop=stop,
        config=config,
        max_failures=20 if not test_mode else 1,
        reuse_actors=False,
        local_dir="data",
        **kwargs
    )

    # save training progress as insurance
    if save_pkl:
        pkl_path = "{}-{}{}.pkl".format(exp_name, trainer_name, "" if not suffix else "-" + suffix)
        with open(pkl_path, "wb") as f:
            data = analysis.fetch_trial_dataframes()
            pickle.dump(data, f)
            print("Result is saved at: <{}>".format(pkl_path))
    return analysis
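A hypothetical invocation of train() is sketched below; the trainer name, environment, and stopping budget are assumptions chosen only to show how the scalar stop value is turned into a timesteps_total criterion.

if __name__ == "__main__":
    analysis = train(
        trainer="PPO",                                    # RLlib trainer name (assumption)
        config={"env": "CartPole-v0", "num_workers": 2},  # env is an assumption
        stop=100000,                                      # scalar -> {"timesteps_total": 100000}
        exp_name="ppo_demo",
        num_seeds=3,
        num_gpus=0,
        checkpoint_freq=10,
    )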
Example #9
    # Search algorithm
    # search_alg = HyperOptSearch()

    # Restore previous search state checkpoint
    # search_alg_state = os.path.join(local_dir, exp_name)
    # if os.path.isdir(search_alg_state):
    #     print('Restore search state:', search_alg_state)
    #     search_alg.restore_from_dir(search_alg_state)

    # Repeat each trial 3 times, not recommended to use with TrialSchedulers
    # search_alg = Repeater(searcher=search_alg, repeat=3)
    # search_alg = ConcurrencyLimiter(search_alg, max_concurrent=max(num_cpus, num_gpus))

    # Progress reporter
    reporter = CLIReporter()
    reporter.add_metric_column(metric='train_loss')
    reporter.add_metric_column(metric='train_acc')
    reporter.add_metric_column(metric='valid_loss')
    reporter.add_metric_column(metric='valid_acc')

    for i in range(len(test_files_list)):
        reporter.add_metric_column(metric='test{}_loss'.format(i))
        reporter.add_metric_column(metric='test{}_acc'.format(i))

    # Ray tune - local_dir/exp_name/trial_name_x
    analysis = tune.run(
        partial(train_audioset, train_files=train_files, valid_files=valid_files, test_files_list=test_files_list),
        metric='valid_acc',
        mode='max',
        name=exp_name,
        stop={'training_iteration': 1 if smoke_test else max_num_epochs},