Example #1
def aug_opt(
        name,
        train_spec,
        scheduler,
        inner_config_dir,
        resume=None,
        fresh=False,
        search_alg=None,
):
    import os
    from ray import tune
    from ray.tune import run
    from ray.tune import Experiment
    train_spec['config']['config_dir'] = inner_config_dir
    if resume is None:
        resume = not fresh

    experiment = Experiment.from_json(name=name, spec=train_spec)
    if fresh and os.path.exists(experiment.local_dir):
        import shutil
        shutil.rmtree(experiment.local_dir)

    run(experiment,
        name=name,
        scheduler=scheduler,
        reuse_actors=True,
        verbose=True,
        resume=resume,
        search_alg=search_alg,
        trial_name_creator=tune.function(trial_str_creator))
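
The call above passes a trial_str_creator helper that the snippet itself does not define; a minimal sketch of such a function (an assumption, mirroring the _trial_name_creator defined below) could be:

# Hypothetical helper, not part of the original snippet: build a readable
# trial name from the trainable name and the trial id.
def trial_str_creator(trial):
    return "{}_{}".format(trial.trainable_name, trial.trial_id)
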
def create_expe_spec(config, n_cpu, n_gpu, exp_dir):
    def _trial_name_creator(trial):
        return "{}_{}_123".format(trial.trainable_name, trial.trial_id)

    # Create env and register it, so ray and rllib can use it
    register_env(config["env_config"]["env"],
                 lambda env_config: env_basic_creator(env_config))

    expe_config = merge_env_algo_config(config)

    # Shouldn't be useful now, automatic in RLLIB
    #trial_resources = {"cpu": expe_config["num_workers"]+3, "gpu": expe_config["num_gpus"]}

    # expe_config["lr"] = grid_search([1e-3, 1e-4, 5e-4, 1e-5, 5e-5])
    # expe_config["target_network_update_freq"] = grid_search([20000, 40000])

    experiment = Experiment(
        name=config["name_expe"],
        run=config["algo"],
        stop=config["stop"],
        config=expe_config,
        num_samples=config.get("num_samples", 1),
        checkpoint_freq=10,
        max_failures=2,
        local_dir=exp_dir,
        # trial_name_creator=tune.function(_trial_name_creator)
        # todo : add when available
    )

    return experiment
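
A hypothetical launch of the experiment built above; `config` is assumed to be the merged env/algo configuration the project loads elsewhere, and the resource counts are illustrative:

import ray
from ray import tune

ray.init(num_cpus=8, num_gpus=1)
experiment = create_expe_spec(config, n_cpu=8, n_gpu=1, exp_dir="./ray_results")
tune.run_experiments(experiment)
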
Example #3
    def _setup_create_experiment_checkpoint_dir(
            self, run_config: Optional[RunConfig]) -> str:
        """Sets up experiment checkpoint dir before actually running the experiment."""
        path = Experiment.get_experiment_checkpoint_dir(
            self._convert_trainable(self._trainable),
            run_config.local_dir,
            run_config.name,
        )
        if not os.path.exists(path):
            os.makedirs(path)
        return path
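
A hedged usage sketch for the helper above, assuming Ray AIR's RunConfig; `tuner_internal` is a hypothetical instance of the class this method belongs to:

from ray.air import RunConfig

run_config = RunConfig(name="my_experiment", local_dir="~/ray_results")
# The checkpoint dir typically resolves to local_dir/name and is created if missing:
# path = tuner_internal._setup_create_experiment_checkpoint_dir(run_config)
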
Example #4
def create_experiment(args):
    """
    Create a single experiment from arguments.
    :param args: The parsed arguments.
    :return: A new experiment with its own trainer.
    """
    experiment_name = get_experiment_name(args)
    config = build_experiment_config_dict(args)
    trainer = get_trainer(args=args, config=config)
    experiment_dict = build_experiment_dict(args, experiment_name, trainer, config)
    return Experiment(**experiment_dict)
def main():
    args = parse_args()
    save_path = args.save_path = os.path.join(args.save_folder, args.arch)
    os.makedirs(save_path)
    #os.makedirs(save_path, exist_ok=True)

    # config
    args.logger_file = os.path.join(save_path, 'log_{}.txt'.format(args.cmd))

    handlers = [
        logging.FileHandler(args.logger_file, mode='w'),
        logging.StreamHandler()
    ]
    logging.basicConfig(level=logging.INFO,
                        datefmt='%m-%d-%y %H:%M',
                        format='%(asctime)s:%(message)s',
                        handlers=handlers)

    if args.cmd == 'train':
        logging.info('start training {}'.format(args.arch))
        run_training(args)
    elif args.cmd == 'test':
        logging.info('start evaluating {} with checkpoints from {}'.format(
            args.arch, args.resume))
        test_model(args)
    elif args.cmd == 'tune':
        import ray
        import ray.tune as tune
        from ray.tune import Experiment
        from ray.tune.median_stopping_rule import MedianStoppingRule

        ray.init()
        sched = MedianStoppingRule(time_attr="timesteps_total",
                                   reward_attr="neg_mean_loss")
        tune.register_trainable(
            "run_training",
            lambda cfg, reporter: run_training(args, cfg, reporter))
        experiment = Experiment(
            "train_rl",
            "run_training",
            trial_resources={"gpu": 1},
            config={"alpha": tune.grid_search([0.1, 0.01, 0.001])})
        tune.run_experiments(experiment, scheduler=sched, verbose=False)
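
The real run_training is not shown here; a minimal sketch of the reporting contract it needs to honour when run under Tune (legacy reporter-based trainable API), with a placeholder loss, might look like this:

def run_training(args, cfg=None, reporter=None):
    # Hypothetical training loop: report the attributes the MedianStoppingRule
    # above is configured with (timesteps_total / neg_mean_loss).
    for step in range(1, 101):
        loss = 1.0 / step  # placeholder for the real validation loss
        if reporter is not None:
            reporter(timesteps_total=step, neg_mean_loss=-loss)
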
Example #6
    def to_ray_experiment(self) -> Experiment:
        with open(self._parameter_file, "r") as parameter_f:
            parameter_file_snippet = parameter_f.read()

        trainable_name = f"{self._experiment_name}_train_func_{self._run_id or '0'}"

        logger.info(f"Register run parameters: {self._run_parameters}")

        register_trainable(
            trainable_name,
            train_func(
                run_parameters=self._run_parameters,
                parameter_file_snippet=parameter_file_snippet,
                current_working_dir=self._current_working_dir,
                serialization_dir=self._serialization_dir,
                include_packages=self._include_packages,
                gpus_available=self._gpus_available,
            ),
        )

        def trial_name_creator(trial):
            params = ",".join([
                f"{k}={v}" for k, v in sorted(self._run_parameters.items(),
                                              key=lambda kv: kv[0])
            ])
            return f"{trial}_{params}"

        config = self._hyperparameters or {}
        return Experiment(
            name=self._experiment_name,
            run=trainable_name,
            config=config,
            resources_per_trial=self._resources_per_trial,
            local_dir=self._log_dir,
            trial_name_creator=tune.function(trial_name_creator)
            if self._run_parameters else None,
        )
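
A hedged usage sketch for the builder above; `executor` is a hypothetical instance of the class that defines to_ray_experiment():

from ray import tune

experiment = executor.to_ray_experiment()
tune.run_experiments(experiment)
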
Example #7
    def train_model_on_task(self, task, task_viz, exp_dir, use_ray,
                            use_ray_logging, grace_period,
                            num_hp_samplings, local_mode,
                            redis_address, lca_n, **training_params):
        logger.info("Training dashboard: {}".format(get_env_url(task_viz)))
        t_id = task['id']

        trainable = self.get_trainable(use_ray_logging=use_ray_logging)
        past_tasks = training_params.pop('past_tasks')
        normalize = training_params.pop('normalize')
        augment_data = training_params.pop('augment_data')

        transformations = []
        if augment_data:
            transformations.extend([
                transforms.ToPILImage(),
                transforms.RandomHorizontalFlip(),
                transforms.RandomCrop(32, 4),
                transforms.ToTensor()
            ])
        t_trans = [[] for _ in range(len(task['split_names']))]
        t_trans[0] = transformations
        datasets = trainable._load_datasets(task,
                                            task['loss_fn'],
                                            past_tasks, t_trans, normalize)
        train_loader, eval_loaders = get_classic_dataloaders(datasets,
                                                             training_params.pop(
                                                                 'batch_sizes'))
        model = self.get_model(task_id=t_id, x_dim=task['x_dim'],
                               n_classes=task['n_classes'],
                               descriptor=task['descriptor'],
                               dataset=eval_loaders[:2])

        if use_ray:
            if not ray.is_initialized():
                ray.init(address=redis_address)

            scheduler = None

            training_params['loss_fn'] = tune.function(
                training_params['loss_fn'])
            training_params['optim_func'] = tune.function(self.optim_func)

            init_model_path = os.path.join(exp_dir, 'model_initializations')
            model_file_name = '{}_init.pth'.format(training_params['name'])
            model_path = os.path.join(init_model_path, model_file_name)
            torch.save(model, model_path)

            training_params['model_path'] = model_path
            config = {**self.get_search_space(),
                      'training-params': training_params}
            if use_ray_logging:
                stop_condition = {'training_iteration':
                                      training_params['n_it_max']}
                checkpoint_at_end = False
                keep_checkpoints_num = 1
                checkpoint_score_attr = 'min-Val nll'
            else:
                stop_condition = None
                # loggers = [JsonLogger, MyCSVLogger]
                checkpoint_at_end = False
                keep_checkpoints_num = None
                checkpoint_score_attr = None

            trainable = rename_class(trainable, training_params['name'])
            experiment = Experiment(
                name=training_params['name'],
                run=trainable,
                stop=stop_condition,
                config=config,
                resources_per_trial=self.ray_resources,
                num_samples=num_hp_samplings,
                local_dir=exp_dir,
                loggers=(JsonLogger, CSVLogger),
                checkpoint_at_end=checkpoint_at_end,
                keep_checkpoints_num=keep_checkpoints_num,
                checkpoint_score_attr=checkpoint_score_attr)

            analysis = tune.run(experiment,
                                scheduler=scheduler,
                                verbose=1,
                                raise_on_failed_trial=True,
                                # max_failures=-1,
                                # with_server=True,
                                # server_port=4321
                                )
            os.remove(model_path)
            logger.info("Training dashboard: {}".format(get_env_url(task_viz)))

            all_trials = {t.logdir: t for t in analysis.trials}
            best_logdir = analysis.get_best_logdir('Val nll', 'min')
            best_trial = all_trials[best_logdir]

            # picked_metric = 'accuracy_0'
            # metric_names = {s: '{} {}'.format(s, picked_metric) for s in
            #                 ['Train', 'Val', 'Test']}

            logger.info('Best trial: {}'.format(best_trial))
            best_res = best_trial.checkpoint.result
            best_point = (best_res['training_iteration'], best_res['Val nll'])

            # y_keys = ['mean_loss' if use_ray_logging else 'Val nll', 'train_loss']
            y_keys = ['Val nll', 'Train nll']

            epoch_key = 'training_epoch'
            it_key = 'training_iteration'
            plot_res_dataframe(analysis, training_params['name'], best_point,
                               task_viz, epoch_key, it_key, y_keys)
            if 'entropy' in next(iter(analysis.trial_dataframes.values())):
                plot_res_dataframe(analysis, training_params['name'], None,
                                    task_viz, epoch_key, it_key, ['entropy'])
            best_model = self.get_model(task_id=t_id)
            best_model.load_state_dict(torch.load(best_trial.checkpoint.value))

            train_accs = analysis.trial_dataframes[best_logdir]['Train accuracy_0']
            best_t = best_res['training_iteration']
            t = best_trial.last_result['training_iteration']
        else:
            search_space = self.get_search_space()
            rand_config = list(generate_variants(search_space))[0][1]
            learner_params = rand_config.pop('learner-params', {})
            optim_params = rand_config.pop('optim')


            split_optims = training_params.pop('split_optims')
            if hasattr(model, 'set_h_params'):
                model.set_h_params(**learner_params)
            if hasattr(model, 'train_loader_wrapper'):
                train_loader = model.train_loader_wrapper(train_loader)

            loss_fn = task['loss_fn']
            if hasattr(model, 'loss_wrapper'):
                loss_fn = model.loss_wrapper(task['loss_fn'])

            prepare_batch = _prepare_batch
            if hasattr(model, 'prepare_batch_wrapper'):
                prepare_batch = model.prepare_batch_wrapper(prepare_batch, t_id)

            optim_fact = partial(set_optim_params,
                                 optim_func=self.optim_func,
                                 optim_params=optim_params,
                                 split_optims=split_optims)
            if hasattr(model, 'train_func'):
                f = model.train_func
                t, metrics, b_state_dict = f(train_loader=train_loader,
                                                eval_loaders=eval_loaders,
                                                optim_fact=optim_fact,
                                                loss_fn=loss_fn,
                                                split_names=task['split_names'],
                                                viz=task_viz,
                                                prepare_batch=prepare_batch,
                                                **training_params)
            else:
                optim = optim_fact(model=model)
                t, metrics, b_state_dict = train(model=model,
                                                 train_loader=train_loader,
                                                 eval_loaders=eval_loaders,
                                                 optimizer=optim,
                                                 loss_fn=loss_fn,
                                                 split_names=task['split_names'],
                                                 viz=task_viz,
                                                 prepare_batch=prepare_batch,
                                                 **training_params)
            train_accs = metrics['Train accuracy_0']
            best_t = b_state_dict['iter']
            if 'training_archs' in metrics:
                plot_trajectory(model.ssn.graph, metrics['training_archs'],
                                model.ssn.stochastic_node_ids, task_viz)
                weights = model.arch_sampler().squeeze()
                archs = model.ssn.get_top_archs(weights, 5)
                list_top_archs(archs, task_viz)
                list_arch_scores(self.arch_scores[t_id], task_viz)
                update_summary(self.arch_scores[t_id], task_viz, 'scores')

        if len(train_accs) > lca_n:
            lca_accs = []
            for i in range(lca_n + 1):
                if i in train_accs:
                    lca_accs.append(train_accs[i])
                else:
                    logger.warning('Missing step for {}/{} for lca computation'
                                   .format(i, lca_n))
            lca = np.mean(lca_accs)
        else:
            lca = float('nan')
        stats = {}
        start = time.time()
        # train_idx = task['split_names'].index('Train')
        # train_path = task['data_path'][train_idx]
        # train_dataset = _load_datasets([train_path])[0]
        train_dataset = _load_datasets(task, 'Train')[0]
        stats.update(self.finish_task(train_dataset, t_id, task_viz,
                                      path='drawings'))
        stats['duration'] = {'iterations': t,
                             'finish': time.time() - start,
                             'best_iterations': best_t}
        stats['params'] = {'total': self.n_params(t_id),
                           'new': self.new_params(t_id)}
        stats['lca'] = lca
        return stats
    # Restore from the given checkpoint (e.g. after a training failure)
    def _restore(self, checkpoint_path):
        with open(checkpoint_path) as f:
            self.timestep = json.loads(f.read())["timestep"]


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--smoke-test", action="store_true", help="Finish quickly for testing")
    args, _ = parser.parse_known_args()
    ray.init()
    exp = Experiment(
        name="hyperband_test",
        run=MyTrainableClass,
        # num_samples: how many configurations are sampled from the hyperparameter
        # space, i.e. how many times the trainable MyTrainableClass is run
        num_samples=10,
        # rename the trials of the trainable
        # trial_name_creator=tune.function(trial_str_creator),
        # log the status of every trial during training
        # loggers=[TestLogger],
        stop={"training_iteration": 1 if args.smoke_test else 999},
        # random.random() returns a random float between 0 and 1
        # config: width is sampled from [10, 100], height from [0, 100]
        config={
            "width": tune.sample_from(lambda spec: 10 + int(90 * random.random())),
            "height": tune.sample_from(lambda spec: int(100 * random.random()))
        })

    trials = run(exp)
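
Several of these examples run a MyTrainableClass that is not shown; a minimal sketch of such a class, assuming the legacy Trainable API (_setup/_train/_save/_restore) that the snippets' _restore methods imply:

import json
import math
import os

from ray.tune import Trainable


class MyTrainableClass(Trainable):
    """Sketch of the trainable assumed by the hyperband examples (not the original)."""

    def _setup(self, config):
        self.timestep = 0

    def _train(self):
        self.timestep += 1
        # Fake objective shaped by the sampled width/height parameters.
        v = math.tanh(float(self.timestep) / self.config["width"])
        return {"episode_reward_mean": v * self.config["height"]}

    def _save(self, checkpoint_dir):
        path = os.path.join(checkpoint_dir, "checkpoint")
        with open(path, "w") as f:
            f.write(json.dumps({"timestep": self.timestep}))
        return path

    def _restore(self, checkpoint_path):
        with open(checkpoint_path) as f:
            self.timestep = json.loads(f.read())["timestep"]
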
Example #9
    def testPointsToEvaluateBasicVariantAdvanced(self):
        config = {
            "grid_1": tune.grid_search(["a", "b", "c", "d"]),
            "grid_2": tune.grid_search(["x", "y", "z"]),
            "nested": {
                "random":
                tune.uniform(2., 10.),
                "dependent":
                tune.sample_from(lambda spec: -1. * spec.config.nested.random)
            }
        }

        points = [
            {
                "grid_1": "b"
            },
            {
                "grid_2": "z"
            },
            {
                "grid_1": "a",
                "grid_2": "y"
            },
            {
                "nested": {
                    "random": 8.0
                }
            },
        ]

        from ray.tune.suggest.basic_variant import BasicVariantGenerator

        # grid_1 * grid_2 gives 4 * 3 = 12 variants per complete grid search
        # However if one grid var is set by preset variables, that run
        # is excluded from grid search.

        # Point 1 overwrites grid_1, so the first trial only grid searches
        # over grid_2 (3 trials).
        # The remaining 5 trials search over the whole space (5 * 12 trials)
        searcher = BasicVariantGenerator(points_to_evaluate=[points[0]])
        exp = Experiment(run=_mock_objective,
                         name="test",
                         config=config,
                         num_samples=6)
        searcher.add_configurations(exp)
        self.assertEqual(searcher.total_samples, 1 * 3 + 5 * 12)

        # Point 2 overwrites grid_2, so the first trial only grid searches
        # over grid_1 (4 trials).
        # The remaining 5 trials search over the whole space (5 * 12 trials)
        searcher = BasicVariantGenerator(points_to_evaluate=[points[1]])
        exp = Experiment(run=_mock_objective,
                         name="test",
                         config=config,
                         num_samples=6)
        searcher.add_configurations(exp)
        self.assertEqual(searcher.total_samples, 1 * 4 + 5 * 12)

        # Point 3 overwrites grid_1 and grid_2, so the first trial does not
        # grid search.
        # The remaining 5 trials search over the whole space (5 * 12 trials)
        searcher = BasicVariantGenerator(points_to_evaluate=[points[2]])
        exp = Experiment(run=_mock_objective,
                         name="test",
                         config=config,
                         num_samples=6)
        searcher.add_configurations(exp)
        self.assertEqual(searcher.total_samples, 1 + 5 * 12)

        # When initialized with all points, the first three trials are
        # defined by the logic above. Only 3 trials are grid searched
        # completely.
        searcher = BasicVariantGenerator(points_to_evaluate=points)
        exp = Experiment(run=_mock_objective,
                         name="test",
                         config=config,
                         num_samples=6)
        searcher.add_configurations(exp)
        self.assertEqual(searcher.total_samples, 1 * 3 + 1 * 4 + 1 + 3 * 12)

        # Run this and confirm results
        analysis = tune.run(exp, search_alg=searcher)
        configs = [trial.config for trial in analysis.trials]

        self.assertEqual(len(configs), searcher.total_samples)
        self.assertTrue(all(config["grid_1"] == "b"
                            for config in configs[0:3]))
        self.assertTrue(all(config["grid_2"] == "z"
                            for config in configs[3:7]))
        self.assertTrue(configs[7]["grid_1"] == "a"
                        and configs[7]["grid_2"] == "y")
        self.assertTrue(configs[8]["nested"]["random"] == 8.0)
        self.assertTrue(configs[8]["nested"]["dependent"] == -8.0)
Example #10
            'optimizer': self.optimizer.state_dict()
        }
        torch.save(cpd, checkpoint_dir + "/save")

    def _restore(self, path):
        cpd = torch.load(path)
        self.iteration = cpd['iteration']
        self.sc.load_state_dict(cpd['state_dict'])
        self.optimizer.load_state_dict(cpd['optimizer'])


if __name__ == "__main__":
    ray.init()
    dset = TensorDataset(
        torch.randn(100, 64, 1024), torch.randn(100, 1024),
        torch.randint(100, size=(100, )).type(torch.LongTensor))

    dset_id = pin_in_object_store(dset)
    tune.register_trainable('train_sc', Trainer)
    exp = Experiment(name="speaker classification",
                     run='train_sc',
                     stop={"timesteps_total": 1},
                     config={
                         "lr": 1e-3,
                         "dset_id": dset_id,
                         "nspeakers": 100,
                         "batch_size": 1,
                     })

    tune.run_experiments(exp)
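
The Trainer above only receives the pinned dataset by id; a sketch of how it would typically be fetched back inside the trainable, assuming the legacy ray.tune.util helpers that pair with pin_in_object_store:

from torch.utils.data import DataLoader
from ray.tune.util import get_pinned_object


def make_train_loader(config):
    # Hypothetical helper: resolve the pinned TensorDataset and wrap it.
    dset = get_pinned_object(config["dset_id"])
    return DataLoader(dset, batch_size=config["batch_size"], shuffle=True)
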
Example #11
    def train_model_on_task(self, task, task_viz, exp_dir, use_ray,
                            use_ray_logging, smoke_test, n_it_max, grace_period,
                            num_hp_samplings, local_mode, tune_register_lock,
                            resources, **training_params):
        logger.info("Training dashboard: {}".format(get_env_url(task_viz)))

        model = self.get_model(task_id=task.id)
        trainable = self.get_trainable(use_ray_logging=use_ray_logging)

        self.prepare_task(task, training_params)

        if use_ray:
            # Required to avoid collisions in Tune's global Registry:
            # https://github.com/ray-project/ray/blob/master/python/ray/tune/registry.py
            trainable = rename_class(trainable, training_params['name'])

            scheduler = None


            training_params['loss_fn'] = tune.function(
                training_params['loss_fn'])
            training_params['optim_func'] = tune.function(self.optim_func)
            training_params['n_it_max'] = n_it_max

            init_model_path = os.path.join(exp_dir, 'model_initializations')
            model_file_name = '{}_init.pth'.format(training_params['name'])
            model_path = os.path.join(init_model_path, model_file_name)
            torch.save(model, model_path)

            training_params['model_path'] = model_path
            config = {'hyper-params': self.get_search_space(smoke_test),
                      'tp': training_params}
            if use_ray_logging:
                stop_condition = {'training_iteration': n_it_max}
                loggers = None
            else:
                stop_condition = None
                loggers = [JsonLogger, MyCSVLogger]

            # We need to create the experiment using a lock here to avoid issues
            # with Tune's global registry, more specifically with the
            # `_to_flush` dict that may change during the iteration over it.
            # https://github.com/ray-project/ray/blob/e3c9f7e83a6007ded7ae7e99fcbe9fcaa371bad3/python/ray/tune/registry.py#L91-L93
            tune_register_lock.acquire()
            experiment = Experiment(
                name=training_params['name'],
                run=trainable,
                stop=stop_condition,
                config=config,
                resources_per_trial=resources,
                num_samples=num_hp_samplings,
                local_dir=exp_dir,
                loggers=loggers,
                keep_checkpoints_num=1,
                checkpoint_score_attr='min-mean_loss')
            tune_register_lock.release()

            analysis = tune.run(experiment,
                                scheduler=scheduler,
                                verbose=1,
                                raise_on_failed_trial=True,
                                # max_failures=-1,
                                # with_server=True,
                                # server_port=4321
                                )
            os.remove(model_path)
            logger.info("Training dashboard: {}".format(get_env_url(task_viz)))

            all_trials = {t.logdir: t for t in analysis.trials}
            best_logdir = analysis.get_best_logdir('mean_loss', 'min')
            best_trial = all_trials[best_logdir]

            # picked_metric = 'accuracy_0'
            # metric_names = {s: '{} {}'.format(s, picked_metric) for s in
            #                 ['Train', 'Val', 'Test']}

            logger.info('Best trial: {}'.format(best_trial))
            best_res = best_trial._checkpoint.last_result
            best_point = (best_res['training_iteration'], best_res['mean_loss'])

            y_keys = ['mean_loss' if use_ray_logging else 'Val nll', 'train_loss']
            epoch_key = 'training_epoch'
            it_key = 'training_iteration' if use_ray_logging else 'training_iterations'
            plot_res_dataframe(analysis, training_params['name'], best_point,
                               task_viz, epoch_key, it_key, y_keys)
            best_model = self.get_model(task_id=task.id)
            best_model.load_state_dict(torch.load(best_trial._checkpoint.value))

            t = best_trial._checkpoint.last_result['training_iteration']
        else:
            data_path = training_params.pop('data_path')
            past_tasks = training_params.pop('past_tasks')
            datasets = trainable._load_datasets(data_path,
                                                training_params['loss_fn'],
                                                past_tasks)
            train_loader, eval_loaders = get_classic_dataloaders(datasets,
                                                                 training_params.pop('batch_sizes'))
            optim = self.optim_func(model.parameters())

            t, accs, best_state_dict = train(model, train_loader, eval_loaders,
                                             optimizer=optim, viz=task_viz,
                                             n_it_max=n_it_max, **training_params)
        logger.info('Finishing task ...')
        t1 = time.time()
        self.finish_task(task.datasets[0])
        logger.info('done in {}s'.format(time.time() - t1))

        return t
Example #12
class FCNetSliceLocalizationTrainable(AbstractFCNetTrainable):
    BENCHMARK_CLASS = FCNetSliceLocalizationBenchmark


class FCNetNavalPropulsionTrainable(AbstractFCNetTrainable):
    BENCHMARK_CLASS = FCNetNavalPropulsionBenchmark


class FCNetParkinsonsTelemonitoringTrainable(AbstractFCNetTrainable):
    BENCHMARK_CLASS = FCNetParkinsonsTelemonitoringBenchmark


if __name__ == "__main__":
    """Example with FCNetProteinStructure and Hyperband."""
    download_fcnet()
    hyperband = HyperBandScheduler(
        time_attr="training_iteration",
        metric="episode_reward_mean",
        mode="max",
        max_t=100)

    exp = Experiment(
        name="hyperband_fcnet_protein_test",
        run=FCNetProteinStructureTrainable,
        num_samples=20,
        stop={"training_iteration": 1},
        config=FCNetProteinStructureTrainable().get_configuration_space())

    run(exp, scheduler=hyperband)
Example #13
            self.timestep = json.loads(f.read())["timestep"]


register_trainable("my_class", MyTrainableClass)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--smoke-test",
                        action="store_true",
                        help="Finish quickly for testing")
    args, _ = parser.parse_known_args()
    ray.init()

    # Hyperband early stopping, configured with `episode_reward_mean` as the
    # objective and `timesteps_total` as the time unit.
    hyperband = HyperBandScheduler(time_attr="timesteps_total",
                                   reward_attr="episode_reward_mean",
                                   max_t=100)

    exp = Experiment(
        name="hyperband_test",
        run="my_class",
        repeat=20,
        stop={"training_iteration": 1 if args.smoke_test else 99999},
        config={
            "width": lambda spec: 10 + int(90 * random.random()),
            "height": lambda spec: int(100 * random.random())
        })

    run_experiments(exp, scheduler=hyperband)
Example #14
            register_trainable(
                name, lambda augs, reporter: eval_tta(copy.deepcopy(copied_c),
                                                      augs, reporter))
            algo = HyperOptSearch(space, metric=reward_attr, mode="max")
            algo = ConcurrencyLimiter(algo,
                                      max_concurrent=num_process_per_gpu *
                                      torch.cuda.device_count())

            experiment_spec = Experiment(
                name,
                run=name,
                num_samples=args.num_search,  # if r == args.repeat-1 else 25,
                resources_per_trial={'gpu': 1. / num_process_per_gpu},
                stop={'training_iteration': args.iter},
                config={
                    'dataroot': args.dataroot,
                    'save_path': paths[cv_fold],
                    'cv_ratio_test': args.cv_ratio,
                    'cv_fold': cv_fold,
                    'num_op': args.num_op,
                    'num_policy': args.num_policy
                },
                local_dir=os.path.join(base_path, "ray_results"),
            )
            analysis = run(experiment_spec,
                           search_alg=algo,
                           scheduler=None,
                           verbose=0,
                           queue_trials=True,
                           resume=args.resume,
                           raise_on_failed_trial=False,
                           global_checkpoint_period=np.inf)
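
The `space` passed to HyperOptSearch above is defined elsewhere in the project; a minimal illustrative space using hyperopt's hp primitives (the parameter names are placeholders, not the project's real ones):

from hyperopt import hp

space = {
    "policy_0": hp.choice("policy_0", list(range(16))),
    "prob_0": hp.uniform("prob_0", 0.0, 1.0),
    "level_0": hp.uniform("level_0", 0.0, 1.0),
}
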
Example #15
        tune.grid_search(['grad_inp', 'grad_latent', 'no_reg', 'grad_mc'])
    }

    config = {}
    for d in (_config, ukn_args):
        config.update(d)

    tune.register_trainable("train_dnn", train_with_dic)

    ray.init()
    exp = Experiment(
        name=args.name,
        run="train_dnn",
        trial_resources={
            "cpu": 6,
            "gpu": 1
        },
        config=config,
        local_dir="./ray_results",
        repeat=args.repeat,
        max_failures=args.max_failure,
        stop={"training_iteration": 1 if args.smoke_test else 99999})

    ahb = AsyncHyperBandScheduler(time_attr="timesteps_total",
                                  reward_attr="mean_accuracy",
                                  grace_period=40,
                                  max_t=200)

    # run_experiments(exp, verbose=args.verbose, scheduler=ahb)

    run_experiments(exp, verbose=args.verbose)
Example #16
 for _ in range(args.repeat):  # run multiple times.
     for gr_id in range(gr_num):
         for cv_id in range(cv_num):
             final_policy_set = []
             name = "search_%s_%s_group%d_%d_cv%d_ratio%.1f" % (C.get()['dataset'], C.get()['model']['type'], gr_id, gr_num, cv_id, args.cv_ratio)
             print(name)
             bo_log_file = open(os.path.join(base_path, name+"_bo_result.csv"), "w", newline="")
             wr = csv.writer(bo_log_file)
             wr.writerow(result_to_save)
             register_trainable(name, lambda augs, reporter: eval_tta2(copy.deepcopy(copied_c), augs, reporter))
             algo = HyperOptSearch(space, metric=reward_attr, mode="max")
             algo = ConcurrencyLimiter(algo, max_concurrent=num_process_per_gpu*torch.cuda.device_count())
             experiment_spec = Experiment(
                 name,
                 run=name,
                 num_samples=args.num_search,# if r == args.repeat-1 else 25,
                 resources_per_trial={'gpu': 1./num_process_per_gpu},
                 stop={'training_iteration': args.iter},
                  config={
                      'dataroot': args.dataroot, 'save_path': paths[cv_id],
                      'cv_ratio_test': args.cv_ratio, 'cv_id': cv_id,
                      'num_op': args.num_op, 'num_policy': args.num_policy,
                      "gr_assign": gr_assign, "gr_id": gr_id
                  },
                 local_dir=os.path.join(base_path, "ray_results"),
                 )
             analysis = run(experiment_spec, search_alg=algo, scheduler=None, verbose=0, queue_trials=True, resume=args.resume, raise_on_failed_trial=False,
                             global_checkpoint_period=np.inf)
             results = analysis.trials
             print()
             results = [x for x in results if x.last_result]
Example #17
            self.timestep = json.loads(f.read())["timestep"]


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--smoke-test",
                        action="store_true",
                        help="Finish quickly for testing")
    args, _ = parser.parse_known_args()
    ray.init()

    # Hyperband early stopping, configured with `episode_reward_mean` as the
    # objective and `training_iteration` as the time unit,
    # which is automatically filled by Tune.
    hyperband = HyperBandScheduler(time_attr="training_iteration",
                                   metric="episode_reward_mean",
                                   mode="max",
                                   max_t=100)

    exp = Experiment(
        name="hyperband_test",
        run=MyTrainableClass,
        num_samples=20,
        stop={"training_iteration": 1 if args.smoke_test else 99999},
        config={
            "width": sample_from(lambda spec: 10 + int(90 * random.random())),
            "height": sample_from(lambda spec: int(100 * random.random()))
        })

    run(exp, scheduler=hyperband)
Example #18
    # Load dev set
    dev_set = load_dev_set(args)
    dev_set_id = pin_in_object_store(dev_set)
    print("Loaded dev. pinned={}".format(True))

    tune.register_trainable('train_sc', train.Trainer)
    exp = Experiment(
        name="speaker classification",
        run='train_sc',
        config={
            "stop": {
                'training_iteration': 500
            },
            # "lr": ray.tune.grid_search([1e-1, 1e-2]),
            "lr": 1e-2,
            "alpha": 0.9,
            "train_set_id": train_set_id,
            "dev_set_id": dev_set_id,
            "nspeakers": nspeakers,
            # "batch_size": ray.tune.grid_search([128, 64, 32]),
            "batch_size": 32,
            "data_parallel": args.data_parallel
        },
        trial_resources={
            "cpu": 20,
            "gpu": 1
        },
    )

    tune.run_experiments(exp, with_server=True, server_port=4321)
def ray_train(cfg, pl_module_cls):
    # We need Munch to hold tune functions. DictConfig can only hold static config.
    cfg = munchconfig_to_tune_munchconfig(dictconfig_to_munch(cfg))
    ray_config = {
        'model': cfg.model,
        'dataset': cfg.dataset,
        'train': cfg.train,
        'seed': cfg.seed,
        'wandb': cfg.wandb,
        'gpu': cfg.runner.gpu_per_trial != 0.0,
    }
    dataset_str = cfg.dataset._target_.split('.')[-1]
    model_str = cfg.model._target_.split('.')[-1]
    args_str = '_'
    # If we're writing to dfs or efs already, no need to sync explicitly
    # This needs to be a noop function, not just False. If False, ray won't restore failed spot instances
    sync_to_driver = None if not cfg.runner.nfs else lambda source, target: None
    experiment = Experiment(
        name=f'{dataset_str}_{model_str}',
        run=partial(pl_train_with_tune, pl_module_cls=pl_module_cls),
        local_dir=cfg.runner.result_dir,
        num_samples=cfg.runner.ntrials if not cfg.smoke_test else 1,
        resources_per_trial={
            'cpu': 1 + cfg.dataset.num_workers,
            'gpu': cfg.runner.gpu_per_trial
        },
        # epochs + 1 because calling trainer.test(model) counts as one epoch
        stop={
            "training_iteration": 1 if cfg.smoke_test else cfg.train.epochs + 1
        },
        config=ray_config,
        loggers=[WandbLogger],
        keep_checkpoints_num=1,  # Save disk space, just need 1 for recovery
        # checkpoint_at_end=True,
        # checkpoint_freq=1000,  # Just to enable recovery with @max_failures
        max_failures=-1,
        sync_to_driver=sync_to_driver,  # As of Ray 1.0.0, still need this here
    )

    if cfg.smoke_test or cfg.runner.local:
        ray.init(num_gpus=torch.cuda.device_count())
    else:
        try:
            ray.init(address='auto')
        except:
            try:
                with open(project_root / 'ray_config/redis_address', 'r') as f:
                    address = f.read().strip()
                with open(project_root / 'ray_config/redis_password',
                          'r') as f:
                    password = f.read().strip()
                    ray.init(address=address, _redis_password=password)
            except:
                ray.init(num_gpus=torch.cuda.device_count())
                import warnings
                warnings.warn("Running Ray with just one node")

    if cfg.runner.hyperband:
        scheduler = AsyncHyperBandScheduler(
            metric='mean_accuracy',
            mode='max',
            max_t=cfg.train.epochs + 1,
            grace_period=cfg.runner.grace_period)
    else:
        scheduler = None
    trials = ray.tune.run(
        experiment,
        scheduler=scheduler,
        # sync_config=SyncConfig(sync_to_driver=sync_to_driver),
        raise_on_failed_trial=False,
        queue_trials=True)
    return trials
ray.init()
register_env(env_name, lambda config: CollectMineralsAndGas())
experiment_spec = Experiment(
    experiment_name,  # experiment name to log
    "DQN",  # algorithm to be used
    checkpoint_freq=100,  # save the model every 100th iteration
    stop={
        "training_iteration": 300,  # stop training after 300 iterations
    },
    config={
        "env": env_name,
        "framework": "tensorflow",  # used framework
        "buffer_size": 50000,
        "timesteps_per_iteration": 1000,
        "n_step": 3,
        "prioritized_replay": True,
        "grad_clip": None,
        "num_workers": 1,
        "num_gpus": 1,  # use gpu
        "exploration_config": {
            "type": "EpsilonGreedy",  # use EpsilonGreedy for exploration
            "initial_epsilon": 1.0,
            "final_epsilon": 0.02,
            "epsilon_timesteps": 1000
        }
    },
)

run_experiments(experiment_spec)
Example #21
        with open(path, "w") as f:
            f.write(json.dumps({"timestep": self.timestep}))
        return path

    def _restore(self, checkpoint_path):
        with open(checkpoint_path) as f:
            self.timestep = json.loads(f.read())["timestep"]


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--smoke-test",
                        action="store_true",
                        help="Finish quickly for testing")
    args, _ = parser.parse_known_args()
    ray.init()
    exp = Experiment(
        name="hyperband_test",
        run=MyTrainableClass,
        num_samples=1,
        trial_name_creator=tune.function(trial_str_creator),
        loggers=[TestLogger],
        stop={"training_iteration": 1 if args.smoke_test else 99999},
        config={
            "width":
            tune.sample_from(lambda spec: 10 + int(90 * random.random())),
            "height": tune.sample_from(lambda spec: int(100 * random.random()))
        })

    trials = run_experiments(exp)
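
The TestLogger used above is not shown; a minimal sketch using the legacy ray.tune Logger interface (on_result is called once per reported result):

from ray.tune.logger import Logger


class TestLogger(Logger):
    def on_result(self, result):
        # Just echo each reported result; a real logger would persist it.
        print("TestLogger", result)
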
Example #22
        config["multiagent"] = {
            'policy_graphs': policy_graph,
            'policy_mapping_fn': tune.function(lambda agent_id: POLICY_ID),
            'policies_to_train': [POLICY_ID]
        }

        env_name_list.append(env_name)
        config_list.append(config)
        # Register as rllib env
        register_env(env_name, create_env)

    exp_list = []
    for config, env_name in zip(config_list, env_name_list):
        exp_tag = {
            "run": alg_run,
            "env": env_name,
            "config": {
                **config
            },
            "checkpoint_freq": 10,
            "max_failures": 999,
            "stop": {
                "training_iteration": 50
            },
            "num_samples": 6,
        }
        exp_list.append(Experiment.from_json(args.exp_tag, exp_tag))

    trials = run_experiments(experiments=exp_list)
def run_experiment(args, parser):
    # args.ray_object_store_memory = int(1e10)
    args.ray_redis_max_memory = int(2e9)

    if args.config_file:
        with open(args.config_file) as f:
            exp = yaml.load(f, Loader=yaml.FullLoader)
    else:
        raise Exception('No config file!')

    exp = merge_dicts(exp, args.config)
    log.info('Num workers: %d, num_envs_per_worker: %d',
             exp['config']['num_workers'],
             exp['config']['num_envs_per_worker'])

    if args.cfg_mixins is not None:
        for cfg_mixin_file in args.cfg_mixins:
            with open(cfg_mixin_file, 'r') as f:
                override_cfg = yaml.load(f, Loader=yaml.FullLoader)
                log.info('Overriding parameters from %s: %r', cfg_mixin_file,
                         override_cfg)
                exp = merge_dicts(exp, override_cfg)

    if not exp.get("run"):
        parser.error("the following arguments are required: --run")
    if not exp.get("env") and not exp.get("config", {}).get("env"):
        parser.error("the following arguments are required: --env")

    if args.ray_num_nodes:
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            cluster.add_node(
                num_cpus=args.ray_num_cpus or 1,
                num_gpus=args.ray_num_gpus or 0,
                object_store_memory=args.ray_object_store_memory,
                redis_max_memory=args.ray_redis_max_memory,
            )
        ray.init(redis_address=cluster.redis_address,
                 local_mode=args.local_mode)
    else:
        ray.init(
            redis_address=args.redis_address,
            object_store_memory=args.ray_object_store_memory,
            redis_max_memory=args.ray_redis_max_memory,
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus,
            local_mode=args.local_mode,
        )

    exp = Experiment.from_json(args.experiment_name, exp)
    exp.spec['checkpoint_freq'] = 20
    if args.pbt:
        exp.spec['checkpoint_freq'] = 3

    exp.spec['checkpoint_at_end'] = True
    # exp.spec['checkpoint_score_attr'] = 'episode_reward_mean'
    exp.spec['keep_checkpoints_num'] = 5

    if args.stop_seconds > 0:
        exp.spec['stop'] = {'time_total_s': args.stop_seconds}

    # if 'multiagent' in exp.spec['config']:
    #     # noinspection PyProtectedMember
    #     make_env = ray.tune.registry._global_registry.get(ENV_CREATOR, exp.spec['config']['env'])
    #     temp_env = make_env(None)
    #     obs_space, action_space = temp_env.observation_space, temp_env.action_space
    #     temp_env.close()
    #     del temp_env
    #
    #     policies = dict(
    #         main=(None, obs_space, action_space, {}),
    #         dummy=(None, obs_space, action_space, {}),
    #     )
    #
    #     exp.spec['config']['multiagent'] = {
    #         'policies': policies,
    #         'policy_mapping_fn': function(lambda agent_id: 'main'),
    #         'policies_to_train': ['main'],
    #     }
    #
    # if args.dbg:
    #     exp.spec['config']['num_workers'] = 1
    #     exp.spec['config']['num_gpus'] = 1
    #     exp.spec['config']['num_envs_per_worker'] = 1
    #
    # if 'callbacks' not in exp.spec['config']:
    #     exp.spec['config']['callbacks'] = {}
    #
    # fps_helper = FpsHelper()
    #
    # def on_train_result(info):
    #     if 'APPO' in exp.spec['run']:
    #         samples = info['result']['info']['num_steps_sampled']
    #     else:
    #         samples = info['trainer'].optimizer.num_steps_trained
    #
    #     fps_helper.record(samples)
    #     fps = fps_helper.get_fps()
    #     info['result']['custom_metrics']['fps'] = fps
    #
    #     # remove this as currently
    #     skip_frames = exp.spec['config']['env_config']['skip_frames']
    #     info['result']['custom_metrics']['fps_frameskip'] = fps * skip_frames
    #
    # exp.spec['config']['callbacks']['on_train_result'] = function(on_train_result)
    #
    # def on_episode_end(info):
    #     episode = info['episode']
    #     stats = {
    #         'DEATHCOUNT': 0,
    #         'FRAGCOUNT': 0,
    #         'HITCOUNT': 0,
    #         'DAMAGECOUNT': 0,
    #         'KDR': 0,
    #         'FINAL_PLACE': 0,
    #         'LEADER_GAP': 0,
    #         'PLAYER_COUNT': 0,
    #         'BOT_DIFFICULTY': 0,
    #     }
    #
    #     # noinspection PyProtectedMember
    #     agent_to_last_info = episode._agent_to_last_info
    #     for agent in agent_to_last_info.keys():
    #         agent_info = agent_to_last_info[agent]
    #         for stats_key in stats.keys():
    #             stats[stats_key] += agent_info.get(stats_key, 0.0)
    #
    #     for stats_key in stats.keys():
    #         stats[stats_key] /= len(agent_to_last_info.keys())
    #
    #     episode.custom_metrics.update(stats)
    #
    # exp.spec['config']['callbacks']['on_episode_end'] = function(on_episode_end)

    extra_kwargs = {}
    if args.pbt:
        extra_kwargs['reuse_actors'] = False

    run(exp,
        name=args.experiment_name,
        scheduler=make_custom_scheduler(args),
        resume=args.resume,
        queue_trials=args.queue_trials,
        **extra_kwargs)
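
make_custom_scheduler is referenced above but not shown; a hedged sketch that returns a PBT scheduler when --pbt is set and no scheduler otherwise (the mutation values are illustrative):

from ray.tune.schedulers import PopulationBasedTraining


def make_custom_scheduler(args):
    # Hypothetical implementation: only PBT runs need a scheduler.
    if not args.pbt:
        return None
    return PopulationBasedTraining(
        time_attr="time_total_s",
        metric="episode_reward_mean",
        mode="max",
        perturbation_interval=300,
        hyperparam_mutations={"lr": [1e-3, 5e-4, 1e-4]},
    )
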
def main(args):

    ray.init(num_cpus=args.rayNumCpu, num_gpus=args.rayNumGpu)

    t_loader, v_loader = get_loaders(train_batch_size=16,
                                     num_workers=1,
                                     data_folder=args.dataFolder,
                                     cuda_available=torch.cuda.is_available())
    pinned_obj_dict['data_loader_train'] = pin_in_object_store(t_loader)
    pinned_obj_dict['data_loader_valid'] = pin_in_object_store(v_loader)
    pinned_obj_dict['args'] = pin_in_object_store(args)

    trainable_name = 'hyp_search_train'
    register_trainable(trainable_name, TrainerClass)

    reward_attr = "acc"

    #############################
    # Define hyperband scheduler
    #############################
    hpb = AsyncHyperBandScheduler(time_attr="training_iteration",
                                  reward_attr=reward_attr,
                                  grace_period=40,
                                  max_t=300)

    ##############################
    # Define hyperopt search algo
    ##############################
    space = {
        'lr': hp.uniform('lr', 0.001, 0.1),
        'optimizer': hp.choice("optimizer", ['SGD', 'Adam']),  # 'Adadelta' excluded: it gets the worst results
        'batch_accumulation': hp.choice("batch_accumulation", [4, 8, 16])
    }
    hos = HyperOptSearch(space, max_concurrent=4, reward_attr=reward_attr)

    #####################
    # Define experiments
    #####################
    exp_name = "resnet152_hyp_search_hyperband_hyperopt_{}".format(
        time.strftime("%Y-%m-%d_%H.%M.%S"))
    exp = Experiment(
        name=exp_name,
        run=trainable_name,
        num_samples=args.numSamples,  # the number of experiments
        resources_per_trial={
            "cpu": args.trialNumCpu,
            "gpu": args.trialNumGpu
        },
        checkpoint_freq=args.checkpointFreq,
        checkpoint_at_end=True,
        stop={
            reward_attr: 0.95,
            "training_iteration": args.
            trainingIteration,  # how many times a specific config will be trained
        })

    ##################
    # Run tensorboard
    ##################
    if args.runTensorBoard:
        thread = threading.Thread(target=launch_tensorboard, args=[exp_name])
        thread.start()
        launch_tensorboard(exp_name)

    ##################
    # Run experiments
    ##################
    run_experiments(exp, search_alg=hos, scheduler=hpb, verbose=False)
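
launch_tensorboard is referenced above but not defined in the snippet; a minimal sketch, assuming results land in the default ~/ray_results directory:

import os
import subprocess


def launch_tensorboard(exp_name):
    # Hypothetical helper: point TensorBoard at the experiment's result directory.
    logdir = os.path.expanduser(os.path.join("~/ray_results", exp_name))
    subprocess.call(["tensorboard", "--logdir", logdir])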