Example 1
    def add_configurations(self,
                           experiments: Union[Experiment, List[Experiment],
                                              Dict[str, Dict]]):
        """Chains generator given experiment specifications.

        Arguments:
            experiments (Experiment | list | dict): Experiments to run.
        """
        experiment_list = convert_to_experiment_list(experiments)
        for experiment in experiment_list:
            grid_vals = count_spec_samples(experiment.spec, num_samples=1)
            lazy_eval = grid_vals > SERIALIZATION_THRESHOLD
            if lazy_eval:
                warnings.warn(
                    f"The number of pre-generated samples ({grid_vals}) "
                    "exceeds the serialization threshold "
                    f"({int(SERIALIZATION_THRESHOLD)}). Resume ability is "
                    "disabled. To fix this, reduce the number of "
                    "dimensions/size of the provided grid search.")

            previous_samples = self._total_samples
            points_to_evaluate = copy.deepcopy(self._points_to_evaluate)
            self._total_samples += count_variants(experiment.spec,
                                                  points_to_evaluate)
            iterator = _TrialIterator(uuid_prefix=self._uuid_prefix,
                                      num_samples=experiment.spec.get(
                                          "num_samples", 1),
                                      unresolved_spec=experiment.spec,
                                      output_path=experiment.dir_name,
                                      points_to_evaluate=points_to_evaluate,
                                      lazy_eval=lazy_eval,
                                      start=previous_samples)
            self._iterators.append(iterator)
            self._trial_generator = itertools.chain(self._trial_generator,
                                                    iterator)
Example 2
def run_experiments(experiments: Union[Experiment, Mapping,
                                       Sequence[Union[Experiment, Mapping]]],
                    scheduler: Optional[TrialScheduler] = None,
                    server_port: Optional[int] = None,
                    verbose: Union[int,
                                   Verbosity] = Verbosity.V3_TRIAL_DETAILS,
                    progress_reporter: Optional[ProgressReporter] = None,
                    resume: bool = False,
                    queue_trials: bool = False,
                    reuse_actors: bool = False,
                    trial_executor: Optional[RayTrialExecutor] = None,
                    raise_on_failed_trial: bool = True,
                    concurrent: bool = True,
                    callbacks: Optional[Sequence[Callback]] = None):
    """Runs and blocks until all trials finish.

    Examples:
        >>> experiment_spec = Experiment("experiment", my_func)
        >>> run_experiments(experiments=experiment_spec)

        >>> experiment_spec = {"experiment": {"run": my_func}}
        >>> run_experiments(experiments=experiment_spec)

    Returns:
        List of Trial objects, holding data for each executed trial.

    """
    # It is important to do this here because it schematizes the
    # experiments and performs the implicit registration.
    experiments = convert_to_experiment_list(experiments)

    if concurrent:
        return run(experiments,
                   server_port=server_port,
                   verbose=verbose,
                   progress_reporter=progress_reporter,
                   resume=resume,
                   queue_trials=queue_trials,
                   reuse_actors=reuse_actors,
                   trial_executor=trial_executor,
                   raise_on_failed_trial=raise_on_failed_trial,
                   scheduler=scheduler,
                   callbacks=callbacks).trials
    else:
        trials = []
        for exp in experiments:
            trials += run(exp,
                          server_port=server_port,
                          verbose=verbose,
                          progress_reporter=progress_reporter,
                          resume=resume,
                          queue_trials=queue_trials,
                          reuse_actors=reuse_actors,
                          trial_executor=trial_executor,
                          raise_on_failed_trial=raise_on_failed_trial,
                          scheduler=scheduler,
                          callbacks=callbacks).trials
        return trials
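
A hedged sketch of what the concurrent flag above controls: with the default concurrent=True both experiments are scheduled in a single tune.run call, while concurrent=False runs them back to back; my_func and the specs are placeholders.

from ray.tune import run_experiments

spec = {
    "exp_a": {"run": my_func, "config": {"lr": 0.01}},
    "exp_b": {"run": my_func, "config": {"lr": 0.1}},
}

# Both experiments are scheduled together in one tune.run call.
trials = run_experiments(spec)

# Each experiment gets its own tune.run call, executed sequentially.
trials = run_experiments(spec, concurrent=False)
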
Example 3
def run_experiments(experiments,
                    scheduler=None,
                    server_port=None,
                    verbose=2,
                    progress_reporter=None,
                    resume=False,
                    queue_trials=False,
                    reuse_actors=False,
                    trial_executor=None,
                    raise_on_failed_trial=True,
                    concurrent=True,
                    callbacks=None):
    """Runs and blocks until all trials finish.

    Examples:
        >>> experiment_spec = Experiment("experiment", my_func)
        >>> run_experiments(experiments=experiment_spec)

        >>> experiment_spec = {"experiment": {"run": my_func}}
        >>> run_experiments(experiments=experiment_spec)

    Returns:
        List of Trial objects, holding data for each executed trial.

    """
    # It is important to do this here because it schematizes the
    # experiments and performs the implicit registration.
    experiments = convert_to_experiment_list(experiments)

    if concurrent:
        return run(experiments,
                   server_port=server_port,
                   verbose=verbose,
                   progress_reporter=progress_reporter,
                   resume=resume,
                   queue_trials=queue_trials,
                   reuse_actors=reuse_actors,
                   trial_executor=trial_executor,
                   raise_on_failed_trial=raise_on_failed_trial,
                   scheduler=scheduler,
                   callbacks=callbacks).trials
    else:
        trials = []
        for exp in experiments:
            trials += run(exp,
                          server_port=server_port,
                          verbose=verbose,
                          progress_reporter=progress_reporter,
                          resume=resume,
                          queue_trials=queue_trials,
                          reuse_actors=reuse_actors,
                          trial_executor=trial_executor,
                          raise_on_failed_trial=raise_on_failed_trial,
                          scheduler=scheduler,
                          callbacks=callbacks).trials
        return trials
Example 4
    def network_debug(self):

        logger = logging.getLogger("detectron2.trainer")

        # inference
        SearchTrainer.test_policies(self.cfg, self.model, None, self.k_th,
                                    self.K_fold)

        # search by explore and exploit
        logger.info("Step2: search best policies")
        name = "search_fold%d" % (self.k_th)
        register_trainable(name, lambda augs, rpt: search_debug(augs, rpt))

        # search algorithm
        algo = HyperOptSearch(self.space,
                              max_concurrent=4 * 20,
                              metric=self.metric,
                              mode=self.mode)  # top1_valid or minus_loss

        # experiments configuration
        exp_config = {
            name: {
                'run': name,
                'num_samples': 4,
                'resources_per_trial': self.resources_per_trial,
                'stop': {
                    'training_iteration': self.num_policy
                },
                'config': {
                    "cfg": self.cfg,
                    "k_th": self.k_th,
                    "K_fold": self.K_fold
                }
            }
        }

        # bayes optimization search
        # results = run_experiments(exp_config, search_alg=algo, scheduler=None, verbose=0, queue_trials=True, raise_on_failed_trial=False)
        results = run(
            convert_to_experiment_list(exp_config),
            name=name,
            search_alg=algo,
            resources_per_trial=self.resources_per_trial,
            return_trials=True,
            verbose=0,
            queue_trials=True,
            raise_on_failed_trial=False,
        )

        # sort
        results = [x for x in results if x.last_result is not None]
        results = sorted(results,
                         key=lambda x: x.last_result[self.metric],
                         reverse=True)

        return []
Example 5
 def testConvertExperimentList(self):
     exp1 = Experiment(**{
         "name": "foo",
         "run": "f1",
         "config": {
             "script_min_iter_time_s": 0
         }
     })
     result = convert_to_experiment_list([exp1, exp1])
     self.assertEqual(len(result), 2)
     self.assertEqual(type(result), list)
Example 6
def run_experiments(experiments,
                    search_alg=None,
                    scheduler=None,
                    with_server=False,
                    server_port=TuneServer.DEFAULT_PORT,
                    verbose=2,
                    resume=False,
                    queue_trials=False,
                    reuse_actors=False,
                    trial_executor=None,
                    raise_on_failed_trial=True):
    """Runs and blocks until all trials finish.

    Examples:
        >>> experiment_spec = Experiment("experiment", my_func)
        >>> run_experiments(experiments=experiment_spec)

        >>> experiment_spec = {"experiment": {"run": my_func}}
        >>> run_experiments(experiments=experiment_spec)

        >>> run_experiments(
        >>>     experiments=experiment_spec,
        >>>     scheduler=MedianStoppingRule(...))

        >>> run_experiments(
        >>>     experiments=experiment_spec,
        >>>     search_alg=SearchAlgorithm(),
        >>>     scheduler=MedianStoppingRule(...))

    Returns:
        List of Trial objects, holding data for each executed trial.

    """
    # It is important to do this here because it schematizes the
    # experiments and performs the implicit registration.
    experiments = convert_to_experiment_list(experiments)

    trials = []
    for exp in experiments:
        trials += run(
            exp,
            search_alg=search_alg,
            scheduler=scheduler,
            with_server=with_server,
            server_port=server_port,
            verbose=verbose,
            resume=resume,
            queue_trials=queue_trials,
            reuse_actors=reuse_actors,
            trial_executor=trial_executor,
            raise_on_failed_trial=raise_on_failed_trial,
            return_trials=True)
    return trials
Example 7
 def testConvertExperimentList(self):
     exp1 = Experiment(**{
         "name": "foo",
         "run": "f1",
         "config": {
             "script_min_iter_time_s": 0
         }
     })
     result = convert_to_experiment_list([exp1, exp1])
     self.assertEqual(len(result), 2)
     self.assertEqual(type(result), list)
Example 8
    def add_configurations(self, experiments):
        """Chains generator given experiment specifications.

        Arguments:
            experiments (Experiment | list | dict): Experiments to run.
        """
        experiment_list = convert_to_experiment_list(experiments)
        for experiment in experiment_list:
            self._trial_generator = itertools.chain(
                self._trial_generator,
                self._generate_trials(experiment.spec, experiment.name))
Example 9
    def add_configurations(self, experiments):
        """Chains generator given experiment specifications.

        Arguments:
            experiments (Experiment | list | dict): Experiments to run.
        """
        experiment_list = convert_to_experiment_list(experiments)
        for experiment in experiment_list:
            self._trial_generator = itertools.chain(
                self._trial_generator,
                self._generate_trials(experiment.spec, experiment.name))
Example 10
    def __init__(self, experiments=None):
        """Constructs a generator given experiment specifications.

        Arguments:
            experiments (Experiment | list | dict): Experiments to run.
        """
        experiment_list = convert_to_experiment_list(experiments)
        self._parser = make_parser()
        self._trial_generator = chain.from_iterable([
            self._generate_trials(experiment.spec, experiment.name)
            for experiment in experiment_list
        ])
        self._finished = False
Example 11
    def add_configurations(self, experiments):
        """Chains generator given experiment specifications.

        Multiplies the number of trials by the repeat factor.

        Arguments:
            experiments (Experiment | list | dict): Experiments to run.
        """
        experiment_list = convert_to_experiment_list(experiments)
        for experiment in experiment_list:
            self._trial_generator = itertools.chain(
                self._trial_generator,
                self._generate_trials(
                    experiment.spec.get("num_samples", 1) * self._repeat,
                    experiment.spec, experiment.name))
Example 12
    def add_configurations(
            self,
            experiments: Union[Experiment, List[Experiment], Dict[str, Dict]]):
        """Chains generator given experiment specifications.

        Arguments:
            experiments (Experiment | list | dict): Experiments to run.
        """
        experiment_list = convert_to_experiment_list(experiments)
        for experiment in experiment_list:
            self._trial_generator = itertools.chain(
                self._trial_generator,
                self._generate_trials(
                    experiment.spec.get("num_samples", 1), experiment.spec,
                    experiment.name))
Example 13
    def add_configurations(self, experiments):
        """Registers experiment specifications.

        Arguments:
            experiments (Experiment | list | dict): Experiments to run.
        """
        logger.debug("added configurations")
        experiment_list = convert_to_experiment_list(experiments)
        assert len(experiment_list) == 1, (
            "SearchAlgorithms can only support 1 experiment at a time.")
        self._experiment = experiment_list[0]
        experiment_spec = self._experiment.spec
        self._total_samples = experiment_spec.get("num_samples", 1)

        _warn_on_repeater(self.searcher, self._total_samples)

        if "run" not in experiment_spec:
            raise TuneError("Must specify `run` in {}".format(experiment_spec))
Example 14
 def testConvertExperimentJSON(self):
     experiment = {
         "name": {
             "run": "f1",
             "config": {
                 "script_min_iter_time_s": 0
             }
         },
         "named": {
             "run": "f1",
             "config": {
                 "script_min_iter_time_s": 0
             }
         }
     }
     result = convert_to_experiment_list(experiment)
     self.assertEqual(len(result), 2)
     self.assertEqual(type(result), list)
Example 15
 def testConvertExperimentJSON(self):
     experiment = {
         "name": {
             "run": "f1",
             "config": {
                 "script_min_iter_time_s": 0
             }
         },
         "named": {
             "run": "f1",
             "config": {
                 "script_min_iter_time_s": 0
             }
         }
     }
     result = convert_to_experiment_list(experiment)
     self.assertEqual(len(result), 2)
     self.assertEqual(type(result), list)
Example 16
    def add_configurations(self,
                           experiments: Union[Experiment, List[Experiment],
                                              Dict[str, Dict]]):
        """Chains generator given experiment specifications.

        Arguments:
            experiments (Experiment | list | dict): Experiments to run.
        """
        experiment_list = convert_to_experiment_list(experiments)
        for experiment in experiment_list:
            points_to_evaluate = copy.deepcopy(self._points_to_evaluate)
            self._total_samples += count_variants(experiment.spec,
                                                  points_to_evaluate)
            self._trial_generator = itertools.chain(
                self._trial_generator,
                self._generate_trials(experiment.spec.get("num_samples",
                                                          1), experiment.spec,
                                      experiment.dir_name, points_to_evaluate))
Example 17
def train_SAC(env, eval_env, out_dir, seed=None, **kwargs):

    ray.init(
        local_mode=kwargs['local'],
        address=(kwargs['ray_address'] if 'ray_address' in kwargs else None),
        ignore_reinit_error=True,
        log_to_driver=False,
        webui_host="0.0.0.0",
    )

    # Get the experiments from the configuration file
    experiments = convert_to_experiment_list(kwargs)

    if len(experiments) == 0:
        raise ValueError("No experiments found")
    elif len(experiments) > 1:
        raise ValueError("Multiple experiments not yet supported")

    # Get the first experiment
    experiment = experiments[0]

    # TODO: define callbacks
    # Create the callback field if it does not exist
    if "callbacks" not in experiment.spec["config"]:
        experiment.spec["config"]["callbacks"] = {}

    callbacks = experiment.spec["config"]["callbacks"]

    checkpoint = None
    if 'checkpoint' in kwargs:
        checkpoint = kwargs['checkpoint']

    print(f"Running experiment:")
    pp.pprint(experiment.spec)

    trials = ray.tune.run(
        experiment,
        resume=kwargs['continue'] if 'continue' in kwargs else False,
        restore=checkpoint,
        return_trials=True,
    )


    return trials
Example 18
def run_experiments(
        experiments: Union[Experiment, Mapping, Sequence[Union[Experiment,
                                                               Mapping]]],
        scheduler: Optional[TrialScheduler] = None,
        server_port: Optional[int] = None,
        verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS,
        progress_reporter: Optional[ProgressReporter] = None,
        resume: bool = False,
        reuse_actors: bool = False,
        trial_executor: Optional[RayTrialExecutor] = None,
        raise_on_failed_trial: bool = True,
        concurrent: bool = True,
        # Deprecated args.
        queue_trials: Optional[bool] = None,
        callbacks: Optional[Sequence[Callback]] = None,
        _remote: Optional[bool] = None):
    """Runs and blocks until all trials finish.

    Examples:
        >>> experiment_spec = Experiment("experiment", my_func)
        >>> run_experiments(experiments=experiment_spec)

        >>> experiment_spec = {"experiment": {"run": my_func}}
        >>> run_experiments(experiments=experiment_spec)

    Returns:
        List of Trial objects, holding data for each executed trial.

    """
    # To be removed in 1.9.
    if queue_trials is not None:
        raise DeprecationWarning(
            "`queue_trials` has been deprecated and is replaced by "
            "the `TUNE_MAX_PENDING_TRIALS_PG` environment variable. "
            "Per default at least one Trial is queued at all times, "
            "so you likely don't need to change anything other than "
            "removing this argument from your call to `tune.run()`")

    if _remote is None:
        _remote = ray.util.client.ray.is_connected()

    if _remote is True and trial_executor:
        raise ValueError("cannot use custom trial executor")

    if not trial_executor or isinstance(trial_executor, RayTrialExecutor):
        _ray_auto_init()

    if _remote:
        remote_run = ray.remote(num_cpus=0)(run_experiments)

        # Make sure tune.run_experiments is run on the server node.
        remote_run = force_on_current_node(remote_run)

        return ray.get(
            remote_run.remote(
                experiments,
                scheduler,
                server_port,
                verbose,
                progress_reporter,
                resume,
                reuse_actors,
                trial_executor,
                raise_on_failed_trial,
                concurrent,
                callbacks,
                _remote=False))

    # It is important to do this here because it schematizes the
    # experiments and performs the implicit registration.
    experiments = convert_to_experiment_list(experiments)

    if concurrent:
        return run(
            experiments,
            server_port=server_port,
            verbose=verbose,
            progress_reporter=progress_reporter,
            resume=resume,
            reuse_actors=reuse_actors,
            trial_executor=trial_executor,
            raise_on_failed_trial=raise_on_failed_trial,
            scheduler=scheduler,
            callbacks=callbacks).trials
    else:
        trials = []
        for exp in experiments:
            trials += run(
                exp,
                server_port=server_port,
                verbose=verbose,
                progress_reporter=progress_reporter,
                resume=resume,
                reuse_actors=reuse_actors,
                trial_executor=trial_executor,
                raise_on_failed_trial=raise_on_failed_trial,
                scheduler=scheduler,
                callbacks=callbacks).trials
        return trials
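
A hedged sketch of the Ray client path above: when ray.init connected through a ray:// address, is_connected() returns True and run_experiments re-submits itself as a zero-CPU task forced onto the head node; the address and trainable below are placeholders.

import ray
from ray import tune
from ray.tune import run_experiments

def trainable(config):
    tune.report(score=config["x"])

# Connecting through the Ray client makes ray.util.client.ray.is_connected()
# return True, so the call below takes the remote_run branch.
ray.init("ray://head-node.example:10001")

trials = run_experiments(
    {"demo": {"run": trainable, "config": {"x": tune.uniform(0, 1)}}})
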
Example 19
def run_experiments(experiments: Union[Experiment, Mapping,
                                       Sequence[Union[Experiment, Mapping]]],
                    scheduler: Optional[TrialScheduler] = None,
                    server_port: Optional[int] = None,
                    verbose: Union[int,
                                   Verbosity] = Verbosity.V3_TRIAL_DETAILS,
                    progress_reporter: Optional[ProgressReporter] = None,
                    resume: bool = False,
                    queue_trials: bool = False,
                    reuse_actors: bool = False,
                    trial_executor: Optional[RayTrialExecutor] = None,
                    raise_on_failed_trial: bool = True,
                    concurrent: bool = True,
                    callbacks: Optional[Sequence[Callback]] = None,
                    _remote: bool = None):
    """Runs and blocks until all trials finish.

    Examples:
        >>> experiment_spec = Experiment("experiment", my_func)
        >>> run_experiments(experiments=experiment_spec)

        >>> experiment_spec = {"experiment": {"run": my_func}}
        >>> run_experiments(experiments=experiment_spec)

    Returns:
        List of Trial objects, holding data for each executed trial.

    """
    if _remote is None:
        _remote = ray.util.client.ray.is_connected()

    if _remote is True and trial_executor:
        raise ValueError("cannot use custom trial executor")

    if not trial_executor or isinstance(trial_executor, RayTrialExecutor):
        _ray_auto_init()

    if _remote:
        remote_run = ray.remote(num_cpus=0)(run_experiments)

        # Make sure tune.run_experiments is run on the server node.
        remote_run = force_on_current_node(remote_run)

        return ray.get(
            remote_run.remote(experiments,
                              scheduler,
                              server_port,
                              verbose,
                              progress_reporter,
                              resume,
                              queue_trials,
                              reuse_actors,
                              trial_executor,
                              raise_on_failed_trial,
                              concurrent,
                              callbacks,
                              _remote=False))

    # It is important to do this here because it schematizes the
    # experiments and performs the implicit registration.
    experiments = convert_to_experiment_list(experiments)

    if concurrent:
        return run(experiments,
                   server_port=server_port,
                   verbose=verbose,
                   progress_reporter=progress_reporter,
                   resume=resume,
                   queue_trials=queue_trials,
                   reuse_actors=reuse_actors,
                   trial_executor=trial_executor,
                   raise_on_failed_trial=raise_on_failed_trial,
                   scheduler=scheduler,
                   callbacks=callbacks).trials
    else:
        trials = []
        for exp in experiments:
            trials += run(exp,
                          server_port=server_port,
                          verbose=verbose,
                          progress_reporter=progress_reporter,
                          resume=resume,
                          queue_trials=queue_trials,
                          reuse_actors=reuse_actors,
                          trial_executor=trial_executor,
                          raise_on_failed_trial=raise_on_failed_trial,
                          scheduler=scheduler,
                          callbacks=callbacks).trials
        return trials
Example 20
 def testConvertExperimentIncorrect(self):
     self.assertRaises(TuneError, lambda: convert_to_experiment_list("hi"))
Example 21
 def testConvertExperimentIncorrect(self):
     self.assertRaises(TuneError, lambda: convert_to_experiment_list("hi"))
Example 22
 def testConvertExperimentNone(self):
     result = convert_to_experiment_list(None)
     self.assertEqual(len(result), 0)
     self.assertEqual(type(result), list)
Example 23
def run_experiments(experiments,
                    search_alg=None,
                    scheduler=None,
                    with_server=False,
                    server_port=TuneServer.DEFAULT_PORT,
                    verbose=True,
                    resume=False,
                    queue_trials=False,
                    trial_executor=None,
                    raise_on_failed_trial=True):
    """Runs and blocks until all trials finish.

    Args:
        experiments (Experiment | list | dict): Experiments to run. Will be
            passed to `search_alg` via `add_configurations`.
        search_alg (SearchAlgorithm): Search Algorithm. Defaults to
            BasicVariantGenerator.
        scheduler (TrialScheduler): Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, and HyperBand.
        with_server (bool): Starts a background Tune server. Needed for
            using the Client API.
        server_port (int): Port number for launching TuneServer.
        verbose (bool): How much output should be printed for each trial.
        resume (bool|"prompt"): If checkpoint exists, the experiment will
            resume from there. If resume is "prompt", Tune will prompt if
            checkpoint detected.
        queue_trials (bool): Whether to queue trials when the cluster does
            not currently have enough resources to launch one. This should
            be set to True when running on an autoscaling cluster to enable
            automatic scale-up.
        trial_executor (TrialExecutor): Manages the execution of trials.
        raise_on_failed_trial (bool): Raise TuneError if any trial ends in
            the ERROR state when the experiments complete.

    Examples:
        >>> experiment_spec = Experiment("experiment", my_func)
        >>> run_experiments(experiments=experiment_spec)

        >>> experiment_spec = {"experiment": {"run": my_func}}
        >>> run_experiments(experiments=experiment_spec)

        >>> run_experiments(
        >>>     experiments=experiment_spec,
        >>>     scheduler=MedianStoppingRule(...))

        >>> run_experiments(
        >>>     experiments=experiment_spec,
        >>>     search_alg=SearchAlgorithm(),
        >>>     scheduler=MedianStoppingRule(...))

    Returns:
        List of Trial objects, holding data for each executed trial.

    """
    # It is important to do this here because it schematizes the
    # experiments and performs the implicit registration.
    experiments = convert_to_experiment_list(experiments)
    checkpoint_dir = _find_checkpoint_dir(experiments)

    runner = None
    restore = False

    if os.path.exists(
            os.path.join(checkpoint_dir, TrialRunner.CKPT_FILE_NAME)):
        if resume == "prompt":
            msg = ("Found incomplete experiment at {}. "
                   "Would you like to resume it?".format(checkpoint_dir))
            restore = click.confirm(msg, default=False)
            if restore:
                logger.info("Tip: to always resume, "
                            "pass resume=True to run_experiments()")
            else:
                logger.info("Tip: to always start a new experiment, "
                            "pass resume=False to run_experiments()")
        elif resume:
            restore = True
        else:
            logger.info(
                "Tip: to resume incomplete experiments, "
                "pass resume='prompt' or resume=True to run_experiments()")
    else:
        logger.info(
            "Did not find checkpoint file in {}.".format(checkpoint_dir))

    if restore:
        runner = try_restore_runner(checkpoint_dir, search_alg, scheduler,
                                    trial_executor)
    else:
        logger.info("Starting a new experiment.")

    if not runner:
        if scheduler is None:
            scheduler = FIFOScheduler()

        if search_alg is None:
            search_alg = BasicVariantGenerator()

        search_alg.add_configurations(experiments)

        runner = TrialRunner(
            search_alg,
            scheduler=scheduler,
            metadata_checkpoint_dir=checkpoint_dir,
            launch_web_server=with_server,
            server_port=server_port,
            verbose=verbose,
            queue_trials=queue_trials,
            trial_executor=trial_executor)

    print(runner.debug_string(max_debug=99999))

    last_debug = 0
    while not runner.is_finished():
        runner.step()
        if time.time() - last_debug > DEBUG_PRINT_INTERVAL:
            print(runner.debug_string())
            last_debug = time.time()

    print(runner.debug_string(max_debug=99999))

    wait_for_log_sync()

    errored_trials = []
    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            errored_trials += [trial]

    if errored_trials:
        if raise_on_failed_trial:
            raise TuneError("Trials did not complete", errored_trials)
        else:
            logger.error("Trials did not complete: %s", errored_trials)

    return runner.get_trials()
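
A hedged sketch of resuming with the legacy signature above: resume='prompt' asks interactively when an incomplete checkpoint is found, while resume=True restores unconditionally; my_func is a placeholder.

experiment_spec = {
    "experiment": {
        "run": my_func,
        "stop": {"training_iteration": 10},
    }
}

# Prompt before restoring if an incomplete experiment checkpoint exists.
trials = run_experiments(experiments=experiment_spec, resume="prompt")

# Or restore unconditionally from the checkpoint directory.
trials = run_experiments(experiments=experiment_spec, resume=True)
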
Example 24
 def testConvertExperimentNone(self):
     result = convert_to_experiment_list(None)
     self.assertEqual(len(result), 0)
     self.assertEqual(type(result), list)
Example 25
def run_experiments(experiments,
                    search_alg=None,
                    scheduler=None,
                    with_server=False,
                    server_port=TuneServer.DEFAULT_PORT,
                    verbose=2,
                    resume=False,
                    queue_trials=False,
                    trial_executor=None,
                    raise_on_failed_trial=True):
    """Runs and blocks until all trials finish.

    Args:
        experiments (Experiment | list | dict): Experiments to run. Will be
            passed to `search_alg` via `add_configurations`.
        search_alg (SearchAlgorithm): Search Algorithm. Defaults to
            BasicVariantGenerator.
        scheduler (TrialScheduler): Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, and HyperBand.
        with_server (bool): Starts a background Tune server. Needed for
            using the Client API.
        server_port (int): Port number for launching TuneServer.
        verbose (int): 0, 1, or 2. Verbosity mode. 0 = silent,
            1 = only status updates, 2 = status and trial results.
        resume (bool|"prompt"): If checkpoint exists, the experiment will
            resume from there. If resume is "prompt", Tune will prompt if
            checkpoint detected.
        queue_trials (bool): Whether to queue trials when the cluster does
            not currently have enough resources to launch one. This should
            be set to True when running on an autoscaling cluster to enable
            automatic scale-up.
        trial_executor (TrialExecutor): Manages the execution of trials.
        raise_on_failed_trial (bool): Raise TuneError if any trial ends in
            the ERROR state when the experiments complete.

    Examples:
        >>> experiment_spec = Experiment("experiment", my_func)
        >>> run_experiments(experiments=experiment_spec)

        >>> experiment_spec = {"experiment": {"run": my_func}}
        >>> run_experiments(experiments=experiment_spec)

        >>> run_experiments(
        >>>     experiments=experiment_spec,
        >>>     scheduler=MedianStoppingRule(...))

        >>> run_experiments(
        >>>     experiments=experiment_spec,
        >>>     search_alg=SearchAlgorithm(),
        >>>     scheduler=MedianStoppingRule(...))

    Returns:
        List of Trial objects, holding data for each executed trial.

    """
    # It is important to do this here because it schematizes the
    # experiments and performs the implicit registration.
    experiments = convert_to_experiment_list(experiments)
    checkpoint_dir = _find_checkpoint_dir(experiments)

    runner = None
    restore = False

    if TrialRunner.checkpoint_exists(checkpoint_dir):
        if resume == "prompt":
            msg = ("Found incomplete experiment at {}. "
                   "Would you like to resume it?".format(checkpoint_dir))
            restore = click.confirm(msg, default=False)
            if restore:
                logger.info("Tip: to always resume, "
                            "pass resume=True to run_experiments()")
            else:
                logger.info("Tip: to always start a new experiment, "
                            "pass resume=False to run_experiments()")
        elif resume:
            restore = True
        else:
            logger.info(
                "Tip: to resume incomplete experiments, "
                "pass resume='prompt' or resume=True to run_experiments()")
    else:
        logger.info(
            "Did not find checkpoint file in {}.".format(checkpoint_dir))

    if restore:
        runner = try_restore_runner(checkpoint_dir, search_alg, scheduler,
                                    trial_executor)
    else:
        logger.info("Starting a new experiment.")

    if not runner:
        if scheduler is None:
            scheduler = FIFOScheduler()

        if search_alg is None:
            search_alg = BasicVariantGenerator()

        search_alg.add_configurations(experiments)

        runner = TrialRunner(search_alg,
                             scheduler=scheduler,
                             metadata_checkpoint_dir=checkpoint_dir,
                             launch_web_server=with_server,
                             server_port=server_port,
                             verbose=bool(verbose > 1),
                             queue_trials=queue_trials,
                             trial_executor=trial_executor)

    if verbose:
        print(runner.debug_string(max_debug=99999))

    last_debug = 0
    while not runner.is_finished():
        runner.step()
        if time.time() - last_debug > DEBUG_PRINT_INTERVAL:
            if verbose:
                print(runner.debug_string())
            last_debug = time.time()

    if verbose:
        print(runner.debug_string(max_debug=99999))

    wait_for_log_sync()

    errored_trials = []
    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            errored_trials += [trial]

    if errored_trials:
        if raise_on_failed_trial:
            raise TuneError("Trials did not complete", errored_trials)
        else:
            logger.error("Trials did not complete: %s", errored_trials)

    return runner.get_trials()
Example 26
 def add_configurations(self, experiments):
     self.experiment_list = convert_to_experiment_list(experiments)
Example 27
    def search(self):

        logger = logging.getLogger("detectron2.trainer")
        logger.info("Step2: search best policies")

        name = "search_fold%d" % (self.k_th)

        # register the trainable function
        register_trainable(
            name,
            lambda augs, rpt: search_func(self.model, self.K_fold, augs, rpt))

        # search algorithm
        algo = HyperOptSearch(
            self.space,
            max_concurrent=4 * 20,
            metric=self.metric,
            mode=self.mode)  # max top1_valid or min minus_loss

        # configuration
        exp_config = {
            name: {
                'run': name,
                'num_samples': 40 if self.smoke_test else self.num_search,
                "resources_per_trial": self.resources_per_trial,
                'stop': {
                    'training_iteration': self.num_policy
                },
                'config': {
                    "cfg": self.cfg,
                    "k_th": self.k_th,
                    "K_fold": self.K_fold,
                    "num_policy": self.num_policy,
                    "num_op": self.num_op,
                    "ops_list": self.ops_list
                }
            }
        }

        # bayes optimization search
        # results = run_experiments(exp_config, search_alg=algo, scheduler=None, verbose=0, queue_trials=True, raise_on_failed_trial=False)
        results = run(
            convert_to_experiment_list(exp_config),
            name=name,
            search_alg=algo,
            resources_per_trial=self.resources_per_trial,
            return_trials=True,
            verbose=0,
            queue_trials=True,
            raise_on_failed_trial=False,
        )

        # sort
        results = [x for x in results if x.last_result is not None]
        results = sorted(results,
                         key=lambda x: x.last_result[self.metric],
                         reverse=True)

        # get top N policies
        final_policy_set = []
        for result in results[:self.num_final_policies]:
            # for result in results[:self.num_final_policies *5//self.K_fold]:
            # transform result to policies
            final_policy = policy_decoder(result.config, self.num_policy,
                                          self.num_op, self.ops_list)
            logger.info('k_th:%d | loss=%.12f top1_valid=%.4f %s' %
                        (self.k_th, result.last_result['minus_loss'],
                         result.last_result['top1_valid'], final_policy))

            final_policy = self._remove_deplicates(final_policy)
            final_policy_set.extend(final_policy)

        return final_policy_set