Esempi in Python per ResourceManager.num_trials_to_schedule

Linguaggio di programmazione: Python

Spazio dei nomi/nome del pacchetto: resource_manager

Classe/tipologia: ResourceManager

Metodo/funzione: num_trials_to_schedule

Esempi su hotexamples.com: 1

ResourceManager.num_trials_to_schedule in Python: 1 esempio trovato. Questo è il miglior esempio reale in Python di resource_manager.ResourceManager.num_trials_to_schedule, estratto da progetti open source. Puoi valutarlo per aiutarci a migliorare la qualità dei nostri esempi.

Metodi utilizzati di frequente

Mostra / Nascondi

ResourceManager(30)

remove_all_jobs_and_tasks(2)

get_width(2)

instance(2)

is_application_finished(2)

is_application_running(2)

get_height(2)

next_application_id(2)

find_kvstore(2)

report(1)

get_updating_kernel(1)

resize(1)

initialize_pharos_tools(1)

insert_new_job(1)

insert_new_tool(1)

requestLevelLoad(1)

remove_tool_by_id(1)

removeAllInvaders(1)

is_path_exist(1)

mark_task_as_finished(1)

remove_all_tools(1)

get_stdout(1)

num_trials_to_schedule(1)

reboot_instances(1)

register_tasks(1)

get_tool_by_id(1)

get_status(1)

get_stderr(1)

get_gpu_access(1)

a(1)

addNewFile(1)

create_dir(1)

create_workers(1)

destroy_workers(1)

getMoney(1)

get_all_metadata(1)

get_all_tools(1)

get_alpha(1)

get_cpu_access(1)

get_instances(1)

_on_enter_frame(1)

get_job_by_id(1)

get_job_info(1)

get_job_list(1)

get_kernel_metadata(1)

get_memory_access(1)

get_next_task(1)

get_output_file(1)

get_rect(1)

get_resource(1)

Esempio n. 1

Mostra file

File: search.py Progetto: richardliaw/gqcnn

class GQCNNSearch(object):
    """Hyper-parameter search driver for GQ-CNN trials.

    On construction, the trial parameters are generated from the supplied
    training configs and one trial object per parameter set is placed on a
    pending queue. `search()` then repeatedly asks the resource manager how
    many trials can be started, starts them, and moves trials that report
    `finished` or `errored_out` into the corresponding containers until no
    trial is pending or running.
    """

    def __init__(self, analysis_config, train_configs, datasets, split_names,
                 base_models=None, output_dir=None, search_name=None,
                 monitor_cpu=True, monitor_gpu=True, cpu_cores=None,
                 gpu_devices=None):
        """Set up the output directory, logger, resource manager and trials.

        Parameters
        ----------
        analysis_config
            Analysis configuration handed unchanged to every trial.
        train_configs, datasets, split_names
            Sequences of equal length that are forwarded to
            `gen_trial_params`.
        base_models
            Optional sequence; when non-empty it must have the same length
            as `train_configs` and fine-tuning trials are created instead of
            training-from-scratch trials. `None` is treated as empty.
        output_dir
            Parent directory for the search output; defaults to 'models'.
        search_name
            Name of the search sub-directory; defaults to
            'gqcnn_hyperparam_search_<timestamp>'.
        monitor_cpu, monitor_gpu, cpu_cores, gpu_devices
            Forwarded to `ResourceManager`. `None` for `cpu_cores` or
            `gpu_devices` is treated as an empty list.

        Raises
        ------
        AssertionError
            If the config/dataset/split (and base model) sequences differ in
            length.
        """
        # Use None as the "not given" marker so that no list object is shared
        # between separate constructions (the lists are handed to other
        # objects that may keep or modify them).
        base_models = [] if base_models is None else base_models
        cpu_cores = [] if cpu_cores is None else cpu_cores
        gpu_devices = [] if gpu_devices is None else gpu_devices

        self._analysis_cfg = analysis_config

        # create trial output dir if not specified
        if search_name is None:
            search_name = 'gqcnn_hyperparam_search_{}'.format(gen_timestamp())
        if output_dir is None:
            output_dir = 'models'
        self._trial_output_dir = os.path.join(output_dir, search_name)
        # Create-then-check instead of check-then-create: avoids failing when
        # something else creates the directory between the two steps.
        try:
            os.makedirs(self._trial_output_dir)
        except OSError:
            if not os.path.isdir(self._trial_output_dir):
                raise

        # set up logger
        self._logger = Logger.get_logger(
            self.__class__.__name__,
            log_file=os.path.join(self._trial_output_dir, 'search.log'),
            global_log_file=True)

        # init resource manager
        self._resource_manager = ResourceManager(
            TrialConstants.TRIAL_CPU_LOAD,
            TrialConstants.TRIAL_GPU_LOAD,
            TrialConstants.TRIAL_GPU_MEM,
            monitor_cpu=monitor_cpu,
            monitor_gpu=monitor_gpu,
            cpu_cores=cpu_cores,
            gpu_devices=gpu_devices)

        # parse train configs and generate individual trial parameters
        fine_tuning = len(base_models) > 0
        if fine_tuning:
            assert len(train_configs) == len(datasets) == len(split_names) == len(base_models), 'Must have equal number of training configs, datasets, split_names, and base models!'
        else:
            assert len(train_configs) == len(datasets) == len(split_names), 'Must have equal number of training configs, datasets, and split_names!'
        self._logger.info('Generating trial parameters...')
        trial_params = gen_trial_params(train_configs, datasets, split_names, base_models=base_models)

        # create pending trial queue
        self._trials_pending_queue = Queue()
        if fine_tuning:
            for trial_name, hyperparam_summary, train_cfg, dataset, base_model, split_name in trial_params:
                self._trials_pending_queue.put(GQCNNFineTuningAndAnalysisTrial(
                    self._analysis_cfg, train_cfg, dataset, base_model,
                    split_name, self._trial_output_dir, trial_name,
                    hyperparam_summary))
        else:
            for trial_name, hyperparam_summary, train_cfg, dataset, split_name in trial_params:
                self._trials_pending_queue.put(GQCNNTrainingAndAnalysisTrial(
                    self._analysis_cfg, train_cfg, dataset, split_name,
                    self._trial_output_dir, trial_name, hyperparam_summary))

        # create containers to hold running, finished, and errored-out trials
        self._trials_running = []
        self._trials_finished = []
        self._trials_errored = []

    def search(self):
        """Run every pending trial, blocking until none is pending or running.

        Each loop iteration logs the trial counts, optionally starts new
        trials (one per GPU entry returned by the resource manager), moves
        trials reporting `finished` / `errored_out` out of the running list,
        and then sleeps for `SearchConstants.SEARCH_THREAD_SLEEP`. A summary
        of finished and errored trials is logged at the end.
        """
        self._logger.info('Beginning hyper-parameter search...')
        done = False
        waiting_for_trial_init = False
        last_schedule_attempt_time = -1
        search_start_time = time.time()
        while not done:
            num_trials_pending = self._trials_pending_queue.qsize()
            num_trials_running = len(self._trials_running)
            num_trials_finished = len(self._trials_finished)
            num_trials_errored = len(self._trials_errored)

            self._logger.info('----------------------------------------------------')
            self._logger.info('Num trials pending: {}'.format(num_trials_pending))
            self._logger.info('Num trials running: {}'.format(num_trials_running))
            self._logger.info('Num trials finished: {}'.format(num_trials_finished))
            if num_trials_errored > 0:
                self._logger.info('Num trials errored: {}'.format(num_trials_errored))

            # Only attempt scheduling when there is work, no previously
            # started trial is still initialising, and the minimum interval
            # since the last attempt has elapsed.
            schedule_interval_elapsed = (time.time() - last_schedule_attempt_time) > SearchConstants.MIN_TIME_BETWEEN_SCHEDULE_ATTEMPTS
            if num_trials_pending > 0 and not waiting_for_trial_init and schedule_interval_elapsed:
                self._logger.info('Attempting to schedule more trials...')
                num_trials_to_schedule, gpus_avail = self._resource_manager.num_trials_to_schedule(num_trials_pending)
                self._logger.info('Scheduling {} trials'.format(num_trials_to_schedule))

                if num_trials_to_schedule > 0:
                    # start trials; zip caps the number started at the
                    # shorter of the requested count and the GPU entries
                    for _, gpu in zip(range(num_trials_to_schedule), gpus_avail):
                        trial = self._trials_pending_queue.get()
                        trial.begin(gpu_avail=gpu, cpu_cores_avail=self._resource_manager.cpu_cores)
                        self._trials_running.append(trial)

                    # block scheduling until trials have started training(this is when we know what resources are still available)
                    waiting_for_trial_init = True
                last_schedule_attempt_time = time.time()

            # check if trials have started training
            if waiting_for_trial_init:
                if all(trial.training_status == GQCNNTrainingStatus.TRAINING for trial in self._trials_running):
                    waiting_for_trial_init = False

            # log trial status
            if len(self._trials_running) > 0:
                self._logger.info(log_trial_status(self._trials_running))

            # check if any trials have finished running or errored-out;
            # collect first, remove afterwards, so the running list is not
            # modified while it is being iterated
            finished_trials_to_move = []
            errored_trials_to_move = []
            for trial in self._trials_running:
                if trial.finished:
                    finished_trials_to_move.append(trial)
                elif trial.errored_out:
                    errored_trials_to_move.append(trial)
            self._trials_finished.extend(finished_trials_to_move)
            self._trials_errored.extend(errored_trials_to_move)
            for trial in finished_trials_to_move + errored_trials_to_move:
                self._trials_running.remove(trial)

            # Update the stopping criterion from the live containers rather
            # than the counts taken at the top of the iteration; the stale
            # counts forced one extra pass and sleep after the last trial
            # had already been moved out of the running list.
            done = (self._trials_pending_queue.qsize() == 0) and (len(self._trials_running) == 0)
            if not done:
                time.sleep(SearchConstants.SEARCH_THREAD_SLEEP)

        self._logger.info('------------------Successful Trials------------------')
        self._logger.info(log_trial_status(self._trials_finished))
        if len(self._trials_errored) > 0:
            self._logger.info('--------------------Failed Trials--------------------')
            self._logger.info(log_trial_status(self._trials_errored))

        self._logger.info('Hyper-parameter search finished in {} seconds.'.format(time.time() - search_start_time))