def _pull_job_info(self):
    """Load the inference job, trial and model for this worker from the meta store.

    Starting from the inference job worker registered under
    ``self._service_id``, follows the references to its inference job, its
    trial and the trial's model. On success the following attributes are set:
    ``_inference_job_id``, ``_py_model_class``, ``_proposal`` and
    ``_store_params_id``.

    Raises:
        InvalidWorkerError: if the worker or its inference job is not found.
        InvalidTrialError: if the trial is not found, the trial has no
            ``store_params_id``, or the trial's model is not found.
    """
    worker_service_id = self._service_id

    logger.info('Reading job info from meta store...')
    store = self._meta_store
    with store:
        worker_row = store.get_inference_job_worker(worker_service_id)
        if worker_row is None:
            message = 'No such worker "{}"'.format(worker_service_id)
            raise InvalidWorkerError(message)

        job = store.get_inference_job(worker_row.inference_job_id)
        if job is None:
            message = 'No such inference job with ID "{}"'.format(
                worker_row.inference_job_id)
            raise InvalidWorkerError(message)

        trial = store.get_trial(worker_row.trial_id)
        # Must have model saved
        has_saved_params = trial is not None and trial.store_params_id is not None
        if not has_saved_params:
            message = 'No saved trial with ID "{}"'.format(worker_row.trial_id)
            raise InvalidTrialError(message)
        logger.info(f'Using trial "{trial.id}"...')

        model = store.get_model(trial.model_id)
        if model is None:
            message = 'No such model with ID "{}"'.format(trial.model_id)
            raise InvalidTrialError(message)
        logger.info(f'Using model "{model.name}"...')

        # Everything resolved; publish the results on the worker instance
        self._inference_job_id = job.id
        self._py_model_class = load_model_class(model.model_file_bytes,
                                                model.model_class)
        self._proposal = Proposal.from_jsonable(trial.proposal)
        self._store_params_id = trial.store_params_id
def pull_job_info(self):
    """Load the train job, sub train job and model for this train worker.

    Starting from the train job worker registered under ``self._service_id``,
    follows the references to its sub train job, the parent train job and the
    sub train job's model. On success the following attributes are set:
    ``train_dataset_path`` and ``val_dataset_path`` (the pair returned by
    ``self._load_datasets(train_job)``), ``train_args``, ``sub_train_job_id``
    and ``model_class``.

    Raises:
        InvalidWorkerError: if the worker, its sub train job, the parent
            train job or the model is not found.
    """
    service_id = self._service_id

    logger.info('Reading job info from meta store...')
    with self._meta_store:
        worker = self._meta_store.get_train_job_worker(service_id)
        if worker is None:
            raise InvalidWorkerError(
                'No such worker "{}"'.format(service_id))

        sub_train_job = self._meta_store.get_sub_train_job(worker.sub_train_job_id)
        if sub_train_job is None:
            # The lookup is by sub train job ID, so report that ID (this
            # service is a train worker, not an advisor).
            raise InvalidWorkerError(
                'No such sub train job with ID "{}"'.format(worker.sub_train_job_id))

        train_job = self._meta_store.get_train_job(sub_train_job.train_job_id)
        if train_job is None:
            raise InvalidWorkerError(
                'No such train job with ID "{}"'.format(sub_train_job.train_job_id))

        model = self._meta_store.get_model(sub_train_job.model_id)
        if model is None:
            raise InvalidWorkerError(
                'No such model with ID "{}"'.format(sub_train_job.model_id))

        logger.info(f'Using model "{model.name}"...')

        (self.train_dataset_path, self.val_dataset_path) = self._load_datasets(train_job)
        self.train_args = train_job.train_args
        self.sub_train_job_id = sub_train_job.id
        self.model_class = load_model_class(model.model_file_bytes, model.model_class)
def pull_job_info(self):
    """Load the sub train job, train job and model for this advisor.

    Looks up the sub train job associated with the advisor service
    ``self._service_id``, then its parent train job and its model, and counts
    the trials already recorded for the sub train job. On success the
    following attributes are set: ``sub_train_job_id``, ``budget``,
    ``model_class``, ``_num_trials`` and ``_model_id``.

    Raises:
        InvalidSubTrainJobError: if the sub train job, its train job or its
            model is not found.
    """
    advisor_service_id = self._service_id

    logger.info('Reading job info from meta store...')
    store = self._meta_store
    with store:
        sub_job = store.get_sub_train_job_by_advisor(advisor_service_id)
        if sub_job is None:
            message = 'No sub train job associated with advisor "{}"'.format(
                advisor_service_id)
            raise InvalidSubTrainJobError(message)

        train_job = store.get_train_job(sub_job.train_job_id)
        if train_job is None:
            message = 'No such train job with ID "{}"'.format(sub_job.train_job_id)
            raise InvalidSubTrainJobError(message)

        model = store.get_model(sub_job.model_id)
        if model is None:
            message = 'No such model with ID "{}"'.format(sub_job.model_id)
            raise InvalidSubTrainJobError(message)

        logger.info(f'Using model "{model.name}"...')
        logger.info(f'Using budget "{train_job.budget}"...')

        existing_trials = store.get_trials_of_sub_train_job(sub_job.id)

        # Everything resolved; publish the results on the advisor instance
        self.sub_train_job_id = sub_job.id
        self.budget = train_job.budget
        self.model_class = load_model_class(model.model_file_bytes,
                                            model.model_class)
        self._num_trials = len(existing_trials)
        self._model_id = model.id
def _load_model(self, trial_id):
    """Build a model instance for a trial and restore its trained parameters.

    Fetches the trial and its model from ``self._db``, instantiates the model
    class with no arguments, initialises it with the trial's knobs, then
    unpickles ``trial.parameters`` and loads the result into the instance.

    Args:
        trial_id: ID of the trial to load.

    Returns:
        The model instance with parameters loaded.
    """
    db = self._db
    trial_row = db.get_trial(trial_id)
    model_row = db.get_model(trial_row.model_id)

    # Instantiate the model class stored for this trial's model
    model_cls = load_model_class(model_row.model_file_bytes, model_row.model_class)
    instance = model_cls()
    instance.init(trial_row.knobs)

    # Parameters are stored pickled on the trial row
    instance.load_parameters(pickle.loads(trial_row.parameters))
    return instance
def _load_model(self, trial_id):
    """Build a model instance for a trial and restore its trained parameters.

    Fetches the trial, its sub train job and that job's model from
    ``self._db``, instantiates the model class with the trial's knobs as
    keyword arguments, then reads the pickled parameters from
    ``trial.params_file_path`` and loads them into the instance.

    Args:
        trial_id: ID of the trial to load.

    Returns:
        The model instance with parameters loaded.
    """
    db = self._db
    trial_row = db.get_trial(trial_id)
    job_row = db.get_sub_train_job(trial_row.sub_train_job_id)
    model_row = db.get_model(job_row.model_id)

    # Instantiate the model class with the knobs recorded on the trial
    model_cls = load_model_class(model_row.model_file_bytes, model_row.model_class)
    instance = model_cls(**trial_row.knobs)

    # Parameters are stored pickled in a file referenced by the trial
    with open(trial_row.params_file_path, 'rb') as params_file:
        raw_params = params_file.read()

    instance.load_parameters(pickle.loads(raw_params))
    return instance
def start(self):
    """Run trials in a loop until the budget is reached or a trial fails.

    Each iteration reads the worker's job info and checks the budget while
    holding the DB connection. If the budget is reached, the worker is
    stopped, the advisor (if one was created) is deleted and the loop exits.
    Otherwise a trial is created in the DB and then run without the
    connection held. A trial that raises is marked as errored and the loop
    exits.
    """
    logger.info('Starting train worker for service of ID "{}"...'
                .format(self._service_id))

    advisor_id = None
    while True:
        with self._db:
            (sub_train_job_id, budget, model_id, model_file_bytes, model_class,
             train_job_id, train_dataset_uri, test_dataset_uri) = self._read_worker_info()

            if self._if_budget_reached(budget, sub_train_job_id):
                # If budget reached
                logger.info('Budget for train job has reached')
                self._stop_worker()
                if advisor_id is not None:
                    self._delete_advisor(advisor_id)
                break

            self._open_trial(sub_train_job_id, model_id)

        # Don't keep DB connection while training model
        try:
            advisor_id = self._execute_trial(advisor_id, model_file_bytes, model_class,
                                             train_dataset_uri, test_dataset_uri)
        except Exception:
            self._fail_trial()
            break  # Exit worker upon trial error


def _open_trial(self, sub_train_job_id, model_id):
    """Create a new trial in the DB and remember its ID in ``self._trial_id``.

    Must be called inside a ``with self._db:`` block.
    """
    logger.info('Creating new trial in DB...')
    trial = self._db.create_trial(
        sub_train_job_id=sub_train_job_id,
        model_id=model_id)
    self._db.commit()
    self._trial_id = trial.id
    logger.info('Created trial of ID "{}" in DB'.format(self._trial_id))


def _execute_trial(self, advisor_id, model_file_bytes, model_class,
                   train_dataset_uri, test_dataset_uri):
    """Train and evaluate the current trial, record the result and report it.

    Creates the advisor on first use when ``advisor_id`` is None. Any
    exception other than a failure to send feedback to the advisor propagates
    to the caller, which is responsible for marking the trial as errored.

    Returns:
        The advisor ID in use (newly created if ``advisor_id`` was None).
    """
    logger.info('Starting trial...')

    # Load model class from bytes
    logger.info('Loading model class...')
    clazz = load_model_class(model_file_bytes, model_class)

    # If not created, create a Rafiki advisor for train worker to propose knobs in trials
    if advisor_id is None:
        logger.info('Creating Rafiki advisor...')
        advisor_id = self._create_advisor(clazz)
        logger.info('Created advisor of ID "{}"'.format(advisor_id))

    # Generate knobs for trial
    logger.info('Requesting for knobs proposal from advisor...')
    knobs = self._get_proposal_from_advisor(advisor_id)
    logger.info('Received proposal of knobs from advisor:')
    logger.info(pprint.pformat(knobs))

    # Mark trial as running in DB
    logger.info('Training & evaluating model...')
    with self._db:
        trial = self._db.get_trial(self._trial_id)
        self._db.mark_trial_as_running(trial, knobs)

    def handle_log(log_line, log_lvl):
        # Each log line opens its own short-lived DB connection
        with self._db:
            trial = self._db.get_trial(self._trial_id)
            self._db.add_trial_log(trial, log_line, log_lvl)

    (score, parameters) = self._train_and_evaluate_model(
        clazz, knobs, train_dataset_uri, test_dataset_uri, handle_log)
    logger.info('Trial score: {}'.format(score))

    with self._db:
        logger.info('Marking trial as complete in DB...')
        trial = self._db.get_trial(self._trial_id)
        self._db.mark_trial_as_complete(trial, score, parameters)

    self._trial_id = None

    # Report results of trial to advisor; a failure here is logged but does
    # not fail the (already completed) trial
    try:
        logger.info('Sending result of trials\' knobs to advisor...')
        self._feedback_to_advisor(advisor_id, knobs, score)
    except Exception:
        logger.error('Error while sending result of proposal to advisor:')
        logger.error(traceback.format_exc())

    return advisor_id


def _fail_trial(self):
    """Log the exception being handled and mark the current trial as errored.

    Must be called from inside an ``except`` block so that
    ``traceback.format_exc()`` can see the active exception.
    """
    logger.error('Error while running trial:')
    logger.error(traceback.format_exc())
    logger.info('Marking trial as errored in DB...')

    with self._db:
        trial = self._db.get_trial(self._trial_id)
        self._db.mark_trial_as_errored(trial)

    self._trial_id = None
def start(self):
    """Run trials in a loop until the train job's budget is reached.

    Each iteration connects to the DB, reads the worker's job info and checks
    the budget. If the budget is reached, the worker is stopped, the advisor
    (if one was created) is deleted and the loop exits. Otherwise the model
    class is loaded, an advisor is created on first use, knobs are requested
    from it and a new trial is created in the DB. The DB connection is then
    released before training. After training, the trial is marked as complete
    or errored, and the score (left at 0 when training raised) is sent back
    to the advisor.

    Raises:
        Exception: errors from loading the model class or creating the
            advisor are logged and re-raised. Errors from requesting knobs
            or creating the trial propagate unlogged.
    """
    logger.info('Starting train worker for service of ID "{}"...'
                .format(self._service_id))

    advisor_id = None
    while True:
        self._db.connect()
        try:
            (budget, model_id, model_file_bytes, model_class,
             train_job_id, train_dataset_uri, test_dataset_uri) = self._read_worker_info()

            if self._if_budget_reached(budget, train_job_id, model_id):
                # If budget reached
                logger.info('Budget for train job has reached')
                self._stop_worker()
                if advisor_id is not None:
                    self._delete_advisor(advisor_id)
                break

            # Load model class from bytes
            try:
                clazz = load_model_class(model_file_bytes, model_class)
            except Exception:
                logger.error('Error while loading model class for worker:')
                logger.error(traceback.format_exc())
                self._stop_worker()
                raise

            # If not created, create a Rafiki advisor for train worker to propose knobs in trials
            if advisor_id is None:
                logger.info('Creating Rafiki advisor...')
                try:
                    advisor_id = self._create_advisor(clazz)
                    logger.info('Created advisor of ID "{}"'.format(advisor_id))
                except Exception:
                    logger.error('Error while creating advisor for worker:')
                    logger.error(traceback.format_exc())
                    raise

            # Create a new trial
            logger.info('Starting trial...')
            logger.info('Requesting for knobs proposal from advisor...')
            knobs = self._get_proposal_from_advisor(advisor_id)
            logger.info('Received proposal of knobs from advisor:')
            logger.info(pprint.pformat(knobs))

            logger.info('Creating new trial in DB...')
            trial = self._create_new_trial(model_id, train_job_id, knobs)
            self._trial_id = trial.id
            logger.info('Created trial of ID "{}" in DB'.format(trial.id))
        finally:
            # Don't keep DB connection while training model. Using `finally`
            # also releases it on the budget-reached exit and on any error
            # raised above, which previously left the connection open.
            self._db.disconnect()

        # Perform trial & record results
        score = 0
        try:
            logger.info('Starting trial...')
            logger.info('Training & evaluating model...')
            (score, parameters, logs) = self._train_and_evaluate_model(
                clazz, knobs, train_dataset_uri, test_dataset_uri)
            logger.info('Trial score: {}'.format(score))

            with self._db:
                logger.info('Marking trial as complete in DB...')
                trial = self._db.get_trial(self._trial_id)
                self._db.mark_trial_as_complete(trial, score, parameters, logs)

            self._trial_id = None

        except Exception:
            logger.error('Error while running trial:')
            logger.error(traceback.format_exc())
            logger.info('Marking trial as errored in DB...')

            with self._db:
                trial = self._db.get_trial(self._trial_id)
                self._db.mark_trial_as_errored(trial)

            self._trial_id = None

        # Report results of trial to advisor; a failure here is logged and
        # the loop continues with the next trial
        try:
            logger.info('Sending result of trials\' knobs to advisor...')
            self._feedback_to_advisor(advisor_id, knobs, score)
        except Exception:
            logger.error('Error while sending result of proposal to advisor:')
            logger.error(traceback.format_exc())