def _pull_shared_params(proposal: Proposal, param_cache: ParamCache):
    if proposal.params_type == ParamsType.NONE:
        return None

    print('Retrieving shared params from cache...')
    shared_params = param_cache.retrieve_params(proposal.params_type)
    return shared_params
def start(self):
    self._monitor.pull_job_info()
    self._train_cache = TrainCache(self._monitor.sub_train_job_id,
                                   self._redis_host, self._redis_port)
    self._param_cache = ParamCache(self._monitor.sub_train_job_id,
                                   self._redis_host, self._redis_port)
    logger.info(
        f'Starting worker for sub train job "{self._monitor.sub_train_job_id}"...'
    )
    self._notify_start()

    while True:
        proposal = self._fetch_proposal()
        if proposal is not None:
            result = self._perform_trial(proposal)
            self._submit_result(result)
        time.sleep(LOOP_SLEEP_SECS)
def start(self):
    self._monitor.pull_job_info()
    self._train_cache = TrainCache(self._monitor.sub_train_job_id,
                                   self._redis_host, self._redis_port)
    self._param_cache = ParamCache(self._monitor.sub_train_job_id,
                                   self._redis_host, self._redis_port)
    self._advisor = self._make_advisor()
    logger.info(
        f'Starting advisor for sub train job "{self._monitor.sub_train_job_id}"...'
    )
    self._notify_start()

    while True:
        self._fetch_results()
        if not self._make_proposals():
            self._notify_budget_reached()
            break
        time.sleep(LOOP_SLEEP_SECS)
def _save_model(model_inst: BaseModel, proposal: Proposal, result: TrialResult,
                param_cache: ParamCache, param_store: ParamStore):
    if not proposal.to_cache_params and not proposal.to_save_params:
        return None

    print('Dumping model parameters...')
    params = model_inst.dump_parameters()

    if proposal.to_cache_params:
        print('Storing shared params in cache...')
        param_cache.store_params(params, score=result.score, time=datetime.now())

    store_params_id = None
    if proposal.to_save_params:
        print('Saving shared params...')
        store_params_id = param_store.save(params)

    return store_params_id
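# A self-contained sketch exercising _save_model above with in-memory
# stand-ins for ParamCache and ParamStore (both hypothetical fakes; the real
# classes are Redis-backed and file-backed respectively). Assumes the
# module's own imports (e.g. datetime) are in scope.
from types import SimpleNamespace

class _FakeParamCache:
    def store_params(self, params, score, time):
        print(f'Cached params with score={score}')

class _FakeParamStore:
    def save(self, params):
        return 'params-001'  # the real store returns a persistent params ID

_model = SimpleNamespace(dump_parameters=lambda: {'w': [0.1]})
_proposal = SimpleNamespace(to_cache_params=True, to_save_params=True)
_result = SimpleNamespace(score=0.9)

print(_save_model(_model, _proposal, _result, _FakeParamCache(), _FakeParamStore()))
# -> 'params-001'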
def start(self):
    self._monitor.pull_job_info()
    self._train_cache = TrainCache(self._monitor.sub_train_job_id,
                                   self._redis_host, self._redis_port)
    self._param_cache = ParamCache(self._monitor.sub_train_job_id,
                                   self._redis_host, self._redis_port)
    logger.info(
        f'Starting worker for sub train job "{self._monitor.sub_train_job_id}"...'
    )
    self._notify_start()

    # If it's distributed training, skip the advisor
    if "DIST_TRAIN_MODEL" in os.environ and os.environ["DIST_TRAIN_MODEL"] == "DIST":
        print("Start dist training")
        logger.info("Start dist training")
        pro = Proposal(trial_no=1, knobs={})
        model_inst = self._load_model(pro)
        self._train_model(model_inst, pro, None)
        result = self._evaluate_model(model_inst, pro)
        # In the master process's container, MASTER_ADDR is "localhost";
        # only the master saves the model
        if os.environ["MASTER_ADDR"] == "localhost":
            self._save_model(model_inst, pro, result)
    else:
        # Training as usual
        while True:
            logger.info('Fetching proposal...')
            proposal = self._fetch_proposal()
            if proposal is not None:
                result = self._perform_trial(proposal)
                self._submit_result(result)
            time.sleep(LOOP_SLEEP_SECS)
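# A sketch of how the distributed branch above might be triggered. The two
# environment variables are the ones checked in start(); the values below
# and the worker IDs are illustrative, assuming the orchestrator injects
# them into each container.
import os

os.environ['DIST_TRAIN_MODEL'] = 'DIST'  # selects the one-shot distributed branch
os.environ['MASTER_ADDR'] = 'localhost'  # "localhost" marks the master replica,
                                         # which alone saves the model
# worker = TrainWorker('svc-123', 'worker-0')  # hypothetical IDs
# worker.start()  # trains once, evaluates, and saves only on the master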
class TrainWorker:

    def __init__(self, service_id, worker_id):
        self._worker_id = worker_id
        self._monitor: _SubTrainJobMonitor = _SubTrainJobMonitor(service_id)
        self._redis_host = os.environ['REDIS_HOST']
        self._redis_port = os.environ['REDIS_PORT']
        self._param_store: ParamStore = FileParamStore()
        self._trial_id = None  # ID of currently running trial
        self._train_cache: TrainCache = None
        self._param_cache: ParamCache = None
        self._trial_errors = 0  # Consecutive trial errors

    def start(self):
        self._monitor.pull_job_info()
        self._train_cache = TrainCache(self._monitor.sub_train_job_id,
                                       self._redis_host, self._redis_port)
        self._param_cache = ParamCache(self._monitor.sub_train_job_id,
                                       self._redis_host, self._redis_port)
        logger.info(
            f'Starting worker for sub train job "{self._monitor.sub_train_job_id}"...'
        )
        self._notify_start()

        while True:
            proposal = self._fetch_proposal()
            if proposal is not None:
                result = self._perform_trial(proposal)
                self._submit_result(result)
            time.sleep(LOOP_SLEEP_SECS)

    def stop(self):
        self._notify_stop()

        # If worker is currently running a trial, mark it as errored
        try:
            if self._trial_id is not None:
                self._monitor.mark_trial_as_errored(self._trial_id)
        except:
            logger.error('Error marking trial as errored:')
            logger.error(traceback.format_exc())

        # Run model class teardown
        try:
            self._monitor.model_class.teardown()
        except:
            logger.error('Error tearing down model class:')
            logger.error(traceback.format_exc())

    def _notify_start(self):
        superadmin_client().send_event(
            'train_job_worker_started',
            sub_train_job_id=self._monitor.sub_train_job_id)
        self._train_cache.add_worker(self._worker_id)

    def _fetch_proposal(self):
        proposal = self._train_cache.get_proposal(self._worker_id)
        return proposal

    def _perform_trial(self, proposal: Proposal) -> TrialResult:
        self._trial_id = proposal.trial_id
        logger.info(
            f'Starting trial {self._trial_id} with proposal {proposal}...')
        logger_info = None
        try:
            # Set up logging
            logger_info = self._start_logging_to_trial(
                lambda log_line, log_lvl: self._monitor.log_to_trial(
                    self._trial_id, log_line, log_lvl))

            self._monitor.mark_trial_as_running(self._trial_id, proposal)

            shared_params = self._pull_shared_params(proposal)
            model_inst = self._load_model(proposal)
            self._train_model(model_inst, proposal, shared_params)
            result = self._evaluate_model(model_inst, proposal)
            store_params_id = self._save_model(model_inst, proposal, result)
            model_inst.destroy()

            self._monitor.mark_trial_as_completed(self._trial_id,
                                                  result.score,
                                                  store_params_id)
            self._trial_errors = 0
            return result
        except Exception as e:
            logger.error('Error while running trial:')
            logger.error(traceback.format_exc())
            self._monitor.mark_trial_as_errored(self._trial_id)

            # Ensure that trial doesn't error too many times consecutively
            self._trial_errors += 1
            if self._trial_errors > MAX_CONSEC_TRIAL_ERRORS:
                logger.error(
                    f'Reached {MAX_CONSEC_TRIAL_ERRORS} consecutive errors - raising exception'
                )
                raise e

            return TrialResult(proposal)
        finally:
            # Guard against a failure before logging was set up
            if logger_info is not None:
                self._stop_logging_to_trial(logger_info)

            # Untie from done trial
            self._trial_id = None

    def _notify_stop(self):
        self._train_cache.delete_worker(self._worker_id)
        superadmin_client().send_event(
            'train_job_worker_stopped',
            sub_train_job_id=self._monitor.sub_train_job_id)

    def _start_logging_to_trial(self, handle_log):
        # Add log handlers for trial, including adding handler to root logger
        # to capture any logs emitted at level INFO or above during model
        # training & evaluation
        log_handler = LoggerUtilsHandler(handle_log)
        py_model_logger = logging.getLogger('{}.trial'.format(__name__))
        py_model_logger.setLevel(logging.INFO)
        py_model_logger.propagate = False  # Avoid duplicate logs in root logger
        py_model_logger.addHandler(log_handler)
        model_logger.set_logger(py_model_logger)

        root_logger = logging.getLogger()
        root_logger.addHandler(log_handler)

        return (root_logger, py_model_logger, log_handler)

    def _load_model(self, proposal: Proposal):
        logger.info('Creating model instance...')
        py_model_class = self._monitor.model_class
        model_inst = py_model_class(**proposal.knobs)
        return model_inst

    def _pull_shared_params(self, proposal: Proposal):
        if proposal.params_type == ParamsType.NONE:
            return None

        logger.info('Retrieving shared params from cache...')
        shared_params = self._param_cache.retrieve_params(proposal.params_type)
        return shared_params

    def _train_model(self, model_inst: BaseModel, proposal: Proposal,
                     shared_params: Union[dict, None]):
        train_dataset_path = self._monitor.train_dataset_path
        train_args = self._monitor.train_args
        logger.info('Training model...')
        model_inst.train(train_dataset_path,
                         shared_params=shared_params,
                         **(train_args or {}))

    def _evaluate_model(self, model_inst: BaseModel,
                        proposal: Proposal) -> TrialResult:
        val_dataset_path = self._monitor.val_dataset_path
        if not proposal.to_eval:
            return TrialResult(proposal)

        logger.info('Evaluating model...')
        score = model_inst.evaluate(val_dataset_path)
        logger.info(f'Score on validation dataset: {score}')
        return TrialResult(proposal, score=score)

    def _save_model(self, model_inst: BaseModel, proposal: Proposal,
                    result: TrialResult):
        if not proposal.to_cache_params and not proposal.to_save_params:
            return None

        logger.info('Dumping model parameters...')
        params = model_inst.dump_parameters()

        if proposal.to_cache_params:
            logger.info('Storing shared params in cache...')
            self._param_cache.store_params(params,
                                           score=result.score,
                                           time=datetime.now())

        store_params_id = None
        if proposal.to_save_params:
            logger.info('Saving shared params...')
            store_params_id = self._param_store.save(params)

        return store_params_id

    def _submit_result(self, result: TrialResult):
        self._train_cache.create_result(self._worker_id, result)
        self._train_cache.delete_proposal(self._worker_id)

    def _stop_logging_to_trial(self, logger_info):
        (root_logger, py_model_logger, log_handler) = logger_info

        # Remove log handlers from loggers for this trial
        root_logger.removeHandler(log_handler)
        py_model_logger.removeHandler(log_handler)
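# A minimal lifecycle sketch for TrainWorker, assuming REDIS_HOST and
# REDIS_PORT point at a reachable Redis and that `service_id`/`worker_id`
# are issued by the orchestrator (hypothetical values below).
import os

os.environ.setdefault('REDIS_HOST', 'localhost')
os.environ.setdefault('REDIS_PORT', '6379')

worker = TrainWorker(service_id='svc-123', worker_id='worker-0')
try:
    worker.start()  # blocks: poll proposal -> run trial -> submit result
except KeyboardInterrupt:
    pass
finally:
    worker.stop()  # marks any in-flight trial as errored, runs model teardown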
class AdvisorWorker:

    def __init__(self, service_id):
        self._monitor: _SubTrainJobMonitor = _SubTrainJobMonitor(service_id)
        self._redis_host = os.environ['REDIS_HOST']
        self._redis_port = os.environ['REDIS_PORT']
        self._train_cache: TrainCache = None
        self._param_cache: ParamCache = None
        self._advisor: BaseAdvisor = None
        self._worker_infos: Dict[str, _WorkerInfo] = {}  # { <worker_id>: <info about worker> }

    def start(self):
        self._monitor.pull_job_info()
        self._train_cache = TrainCache(self._monitor.sub_train_job_id,
                                       self._redis_host, self._redis_port)
        self._param_cache = ParamCache(self._monitor.sub_train_job_id,
                                       self._redis_host, self._redis_port)
        self._advisor = self._make_advisor()
        logger.info(
            f'Starting advisor for sub train job "{self._monitor.sub_train_job_id}"...'
        )
        self._notify_start()

        while True:
            self._fetch_results()
            if not self._make_proposals():
                self._notify_budget_reached()
                break
            time.sleep(LOOP_SLEEP_SECS)

    def stop(self):
        self._notify_stop()

        # Clear caches for sub train job
        try:
            self._train_cache.clear_all()
        except:
            logger.error('Error clearing train cache:')
            logger.error(traceback.format_exc())
        try:
            self._param_cache.clear_all_params()
        except:
            logger.error('Error clearing params cache:')
            logger.error(traceback.format_exc())

    def _notify_start(self):
        superadmin_client().send_event(
            'sub_train_job_advisor_started',
            sub_train_job_id=self._monitor.sub_train_job_id)

    def _make_advisor(self):
        clazz = self._monitor.model_class
        budget = self._monitor.budget

        # Retrieve knob config
        knob_config = clazz.get_knob_config()

        advisor = make_advisor(knob_config, budget)
        logger.info(f'Using advisor "{type(advisor).__name__}"...')

        return advisor

    # Fetch results of workers
    def _fetch_results(self):
        for (worker_id, info) in self._worker_infos.items():
            # If no pending trial, skip
            if info.trial_id is None:
                continue

            # Fetch result for worker; if no result yet, skip
            result = self._train_cache.take_result(worker_id)
            if result is None:
                continue

            # Pass result to advisor
            self._advisor.feedback(worker_id, result)

            # Mark worker as not pending
            info.trial_id = None

    # Make proposals for workers
    # Returns False if tuning is to be stopped
    def _make_proposals(self):
        # For each free worker
        worker_ids = self._train_cache.get_workers()
        for worker_id in worker_ids:
            # If new worker, add info
            if worker_id not in self._worker_infos:
                self._worker_infos[worker_id] = _WorkerInfo()

            # Get info for worker
            worker_info = self._worker_infos[worker_id]

            # Check that worker doesn't already have a proposal
            proposal = self._train_cache.get_proposal(worker_id)
            if proposal is not None:
                continue

            # Create trial
            (trial_no, trial_id) = self._monitor.create_next_trial(worker_id)

            # Make proposal to free worker
            proposal = self._advisor.propose(worker_id, trial_no)

            # If advisor has no more proposals, stop tuning
            if proposal is None:
                return False

            # Attach trial ID to proposal
            proposal.trial_id = trial_id

            # Push proposal to worker
            self._train_cache.create_proposal(worker_id, proposal)

            # Associate trial ID with worker
            worker_info.trial_id = trial_id

        return True

    def _notify_budget_reached(self):
        superadmin_client().send_event(
            'sub_train_job_budget_reached',
            sub_train_job_id=self._monitor.sub_train_job_id)

    def _notify_stop(self):
        superadmin_client().send_event(
            'sub_train_job_advisor_stopped',
            sub_train_job_id=self._monitor.sub_train_job_id)
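# A minimal, self-contained sketch of the proposal/result handshake that
# AdvisorWorker and TrainWorker perform through TrainCache. _MemCache is a
# hypothetical in-memory stand-in for the Redis-backed TrainCache; the
# method names mirror the calls used above. Both sides poll the cache;
# neither talks to the other directly.
class _MemCache:
    def __init__(self):
        self.workers, self.proposals, self.results = [], {}, {}
    def add_worker(self, wid): self.workers.append(wid)
    def get_workers(self): return list(self.workers)
    def create_proposal(self, wid, p): self.proposals[wid] = p
    def get_proposal(self, wid): return self.proposals.get(wid)
    def delete_proposal(self, wid): self.proposals.pop(wid, None)
    def create_result(self, wid, r): self.results[wid] = r
    def take_result(self, wid): return self.results.pop(wid, None)

cache = _MemCache()
cache.add_worker('w0')                     # worker announces itself as free
assert 'w0' in cache.get_workers()         # advisor sees the free worker
cache.create_proposal('w0', {'lr': 1e-3})  # advisor pushes a proposal
print(cache.get_proposal('w0'))            # worker picks it up, runs the trial
cache.create_result('w0', {'score': 0.9})  # worker publishes the result
cache.delete_proposal('w0')                # worker frees itself again
print(cache.take_result('w0'))             # advisor ingests the feedback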
def tune_model(
        py_model_class: Type[BaseModel],
        train_dataset_path: str,
        val_dataset_path: str,
        annotation_dataset_path: str = None,
        task: str = None,
        test_dataset_path: str = None,
        budget: Budget = None,
        train_args: Dict[str, Any] = None) -> Tuple[Proposal, float, Params]:
    worker_id = 'local'

    # Note start time
    start_time = time.time()

    # Retrieve config of model
    _print_header('Checking model configuration...')
    knob_config = py_model_class.get_knob_config()
    _check_knob_config(knob_config)

    # Read knob values from CLI args
    _print_header('Starting trials...')
    knobs_from_args = _maybe_read_knobs_from_args(knob_config)

    # Read budget options from CLI args
    budget_from_args = _maybe_read_budget_from_args()
    budget = {**(budget or {}), **budget_from_args}
    inform_user(f'Using budget {budget}...')

    # Make advisor
    advisor = make_advisor(knob_config, budget)
    inform_user(f'Using advisor "{type(advisor).__name__}"...')

    # Create caches & stores
    param_store: ParamStore = FileParamStore()
    param_cache: ParamCache = ParamCache()
    train_cache: TrainCache = TrainCache()

    # Variables to track over trials
    best_model_score = -1
    best_trial_no = 0
    best_model_test_score = None
    best_proposal = None
    best_store_params_id = None

    # Train worker tells advisor that it is free
    train_cache.add_worker(worker_id)

    # Keep conducting trials until there are no more proposals
    trial_no = 0
    while True:
        trial_no += 1

        # Advisor checks free workers
        worker_ids = train_cache.get_workers()
        assert worker_id in worker_ids

        # Advisor checks that worker doesn't already have a proposal
        proposal = train_cache.get_proposal(worker_id)
        assert proposal is None

        # Advisor sends a proposal to worker, overriding knobs from args
        proposal = advisor.propose(worker_id, trial_no)
        if proposal is None:
            print('No more proposals from advisor - stopping training')
            break
        proposal.knobs = {**proposal.knobs, **knobs_from_args}
        train_cache.create_proposal(worker_id, proposal)

        # Worker receives proposal
        proposal = train_cache.get_proposal(worker_id)
        assert proposal is not None

        # Worker starts trial
        _print_header(f'Trial #{trial_no}')
        print('Proposal from advisor:', proposal)

        # Worker loads model
        model_inst = py_model_class(**proposal.knobs)

        # Worker pulls shared params
        shared_params = _pull_shared_params(proposal, param_cache)

        # Worker trains model
        print('Training model...')
        if annotation_dataset_path:
            model_inst.train(train_dataset_path,
                             annotation_dataset_path=annotation_dataset_path,
                             shared_params=shared_params,
                             **(train_args or {}))
        else:
            model_inst.train(train_dataset_path,
                             shared_params=shared_params,
                             **(train_args or {}))

        # Worker evaluates model
        if annotation_dataset_path:
            result = _evaluate_model(model_inst, proposal, val_dataset_path,
                                     annotation_dataset_path)
        else:
            result = _evaluate_model(model_inst, proposal, val_dataset_path)

        # Worker caches/saves model parameters
        store_params_id = _save_model(model_inst, proposal, result,
                                      param_cache, param_store)

        # Update best saved model
        if result.score is not None and store_params_id is not None \
                and result.score > best_model_score:
            inform_user(
                'Best saved model so far! Beats previous best of score {}!'.
                format(best_model_score))
            best_store_params_id = store_params_id
            best_proposal = proposal
            best_model_score = result.score
            best_trial_no = trial_no

            # Test best model, if test dataset provided
            if test_dataset_path is not None:
                print('Evaluating new best model on test dataset...')
                if annotation_dataset_path:
                    best_model_test_score = model_inst.evaluate(
                        test_dataset_path,
                        annotation_dataset_path=annotation_dataset_path)
                else:
                    best_model_test_score = model_inst.evaluate(
                        test_dataset_path)
                inform_user(
                    'Score on test dataset: {}'.format(best_model_test_score))

        # Worker sends result to advisor
        print('Giving feedback to advisor...')
        train_cache.create_result(worker_id, result)
        train_cache.delete_proposal(worker_id)

        # Advisor receives and ingests feedback
        result = train_cache.take_result(worker_id)
        assert result is not None
        advisor.feedback(worker_id, result)

        # Destroy model
        model_inst.destroy()

        if task == 'question_answering_covid19':
            break

    # Train worker tells advisor that it is no longer free
    train_cache.delete_worker(worker_id)

    # Declare best model
    if best_proposal is not None:
        inform_user('Best trial #{} has knobs {} with score of {}'.format(
            best_trial_no, best_proposal.knobs, best_model_score))
        if best_model_test_score is not None:
            inform_user(
                '...with test score of {}'.format(best_model_test_score))

    # Load params for best model
    best_params = None
    if best_store_params_id is not None:
        best_params = param_store.load(best_store_params_id)

    # Tear down model class
    print('Running model class teardown...')
    py_model_class.teardown()

    # Print duration
    duration = time.time() - start_time
    print('Tuning took a total of {}s'.format(duration))

    return (best_proposal, best_model_test_score, best_params)
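# A hedged usage sketch for tune_model. `MyModel` and the dataset paths are
# hypothetical; the model class is expected to subclass BaseModel, and the
# budget key name below is an assumption about the project's Budget schema.
if __name__ == '__main__':
    (best_proposal, test_score, best_params) = tune_model(
        MyModel,                              # hypothetical BaseModel subclass
        train_dataset_path='data/train.zip',  # hypothetical dataset paths
        val_dataset_path='data/val.zip',
        test_dataset_path='data/test.zip',
        budget={'MODEL_TRIAL_COUNT': 10},     # key name is an assumption
    )
    print(best_proposal, test_score)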