def __init__(self, service_id: str, meta_store: MetaStore = None):
    """Bind this monitor to a worker service and set up its stores.

    :param service_id: ID of the worker service this monitor tracks
    :param meta_store: metadata store to use; a new ``MetaStore`` is
        created when none is supplied
    """
    # Stores & service binding.
    self._meta_store = meta_store or MetaStore()
    self._service_id = service_id
    self._data_store = FileDataStore()
    # Job metadata — all start empty and are filled in later
    # (presumably by a pull step; confirm against the owning class).
    self.sub_train_job_id = None
    self.model_class = None
    self.train_dataset_path = None
    self.val_dataset_path = None
    self.train_args = None
def __init__(self, meta_store=None, container_manager=None, data_store=None, param_store=None):
    """Wire up the admin service's stores, container backend and services manager.

    :param meta_store: metadata store (defaults to a new ``MetaStore``)
    :param container_manager: container backend; when falsy, one is chosen
        from the ``CONTAINER_MODE`` env var (``SWARM`` → Docker Swarm,
        anything else → Kubernetes)
    :param data_store: dataset store (defaults to ``FileDataStore``)
    :param param_store: parameter store (defaults to ``FileParamStore``)
    """
    self._meta_store = meta_store or MetaStore()

    # Select the container backend class from the environment, then only
    # instantiate it if the caller didn't provide a manager.
    manager_cls = (DockerSwarmContainerManager
                   if os.getenv('CONTAINER_MODE', 'SWARM') == 'SWARM'
                   else KubernetesContainerManager)
    container_manager = container_manager or manager_cls()

    self._data_store: DataStore = data_store or FileDataStore()
    self._param_store: ParamStore = param_store or FileParamStore()

    # Worker image tag comes from the deployment environment.
    self._base_worker_image = (f"{os.environ['SINGA_AUTO_IMAGE_WORKER']}"
                               f":{os.environ['SINGA_AUTO_VERSION']}")
    self._services_manager = ServicesManager(self._meta_store, container_manager)
class _SubTrainJobMonitor:
    '''
    Manages fetching & updating of metadata & datasets
    '''

    def __init__(self, service_id: str, meta_store: MetaStore = None):
        # Job metadata — populated by `pull_job_info()`
        self.sub_train_job_id = None
        self.model_class = None
        self.train_dataset_path = None
        self.val_dataset_path = None
        self.train_args = None
        # Backing stores & the service this monitor is bound to
        self._meta_store = meta_store or MetaStore()
        self._service_id = service_id
        self._data_store = FileDataStore()

    def pull_job_info(self):
        '''
        Fetches this worker's sub train job, train job & model from the meta
        store, loads the job's datasets locally, and populates this monitor's
        attributes (`sub_train_job_id`, `model_class`, dataset paths, `train_args`).

        :raises InvalidWorkerError: if the worker, its sub train job, train job
            or model cannot be found in the meta store
        :raises InvalidDatasetError: if the job's datasets cannot be loaded
        '''
        service_id = self._service_id
        logger.info('Reading job info from meta store...')
        with self._meta_store:
            worker = self._meta_store.get_train_job_worker(service_id)
            if worker is None:
                raise InvalidWorkerError(
                    'No such worker "{}"'.format(service_id))

            sub_train_job = self._meta_store.get_sub_train_job(
                worker.sub_train_job_id)
            if sub_train_job is None:
                raise InvalidWorkerError(
                    'No such sub train job associated with advisor "{}"'.format(
                        service_id))

            train_job = self._meta_store.get_train_job(
                sub_train_job.train_job_id)
            if train_job is None:
                raise InvalidWorkerError(
                    'No such train job with ID "{}"'.format(
                        sub_train_job.train_job_id))

            model = self._meta_store.get_model(sub_train_job.model_id)
            if model is None:
                raise InvalidWorkerError('No such model with ID "{}"'.format(
                    sub_train_job.model_id))

            logger.info(f'Using model "{model.name}"...')

            (self.train_dataset_path,
             self.val_dataset_path) = self._load_datasets(train_job)
            self.train_args = train_job.train_args
            self.sub_train_job_id = sub_train_job.id
            # Deserializing the model class last, after all metadata is validated
            self.model_class = load_model_class(model.model_file_bytes,
                                                model.model_class)

    def mark_trial_as_errored(self, trial_id):
        '''Marks trial `trial_id` as errored in the meta store.'''
        logger.info('Marking trial as errored in store...')
        with self._meta_store:
            trial = self._meta_store.get_trial(trial_id)
            self._meta_store.mark_trial_as_errored(trial)

    def mark_trial_as_running(self, trial_id, proposal):
        '''Marks trial `trial_id` as running in the meta store, recording `proposal`.'''
        logger.info('Marking trial as running in store...')
        with self._meta_store:
            trial = self._meta_store.get_trial(trial_id)
            self._meta_store.mark_trial_as_running(trial, proposal.to_jsonable())

    def mark_trial_as_completed(self, trial_id, score, store_params_id):
        '''Marks trial `trial_id` as completed with its `score` & stored params ID.'''
        logger.info('Marking trial as completed in store...')
        with self._meta_store:
            trial = self._meta_store.get_trial(trial_id)
            self._meta_store.mark_trial_as_completed(trial, score,
                                                     store_params_id)

    def log_to_trial(self, trial_id, log_line, log_lvl):
        '''Appends a log line (at level `log_lvl`) to trial `trial_id` in the meta store.'''
        with self._meta_store:
            trial = self._meta_store.get_trial(trial_id)
            self._meta_store.add_trial_log(trial, log_line, log_lvl)

    def _load_datasets(self, train_job):
        '''
        Loads the train & validation datasets of `train_job` from the data
        store to local paths.

        :returns: tuple of `(train_dataset_path, val_dataset_path)`
        :raises InvalidDatasetError: if either dataset is missing from the
            meta store or fails to load from the data store
        '''
        try:
            # Explicit checks (not `assert`) so validation still runs under
            # `python -O`; any failure here is re-wrapped as InvalidDatasetError
            train_dataset = self._meta_store.get_dataset(
                train_job.train_dataset_id)
            if train_dataset is None:
                raise ValueError('No such train dataset with ID "{}"'.format(
                    train_job.train_dataset_id))

            val_dataset = self._meta_store.get_dataset(train_job.val_dataset_id)
            if val_dataset is None:
                raise ValueError('No such validation dataset with ID "{}"'.format(
                    train_job.val_dataset_id))

            train_dataset_path = self._data_store.load(
                train_dataset.store_dataset_id)
            val_dataset_path = self._data_store.load(
                val_dataset.store_dataset_id)
            if train_dataset_path is None or val_dataset_path is None:
                raise ValueError('Failed to load datasets from data store')
        except Exception as e:
            # Chain the cause so the original traceback isn't lost
            raise InvalidDatasetError(e) from e

        return (train_dataset_path, val_dataset_path)