Esempio n. 1
0
 def __init__(self, service_id: str, meta_store: MetaStore = None):
     '''
         Sets up the monitor for the service identified by `service_id`.

         :param service_id: ID of the service, stored for later lookups
         :param meta_store: Meta store to use; when omitted (or falsy),
             a new `MetaStore` is created
     '''
     # Job details all start out unset
     self.sub_train_job_id = self.model_class = None
     self.train_dataset_path = self.val_dataset_path = None
     self.train_args = None

     # Fall back to a default meta store when the caller supplied none
     store = meta_store if meta_store else MetaStore()
     self._meta_store = store
     self._service_id = service_id

     # The data store is always a fresh file-backed one
     self._data_store = FileDataStore()
Esempio n. 2
0
 def __init__(self,
              meta_store=None,
              container_manager=None,
              data_store=None,
              param_store=None):
     '''
         Wires up the stores, the base worker image name and the services manager.

         Every argument is optional; any that is omitted (or falsy) is
         replaced by a default implementation.

         :param meta_store: defaults to a new `MetaStore`
         :param container_manager: defaults to a `DockerSwarmContainerManager`
             when the `CONTAINER_MODE` environment variable is `SWARM` or
             unset, otherwise to a `KubernetesContainerManager`
         :param data_store: defaults to a new `FileDataStore`
         :param param_store: defaults to a new `FileParamStore`
         :raises KeyError: if `SINGA_AUTO_IMAGE_WORKER` or `SINGA_AUTO_VERSION`
             is missing from the environment
     '''
     self._meta_store = meta_store if meta_store else MetaStore()

     # The container mode is read unconditionally; an unset variable means 'SWARM'
     use_swarm = os.getenv('CONTAINER_MODE', 'SWARM') == 'SWARM'
     if not container_manager:
         if use_swarm:
             container_manager = DockerSwarmContainerManager()
         else:
             container_manager = KubernetesContainerManager()

     self._data_store: DataStore = (data_store
                                    if data_store else FileDataStore())
     self._param_store: ParamStore = (param_store
                                      if param_store else FileParamStore())

     # Worker image reference is "<image>:<version>", both taken from the environment
     worker_image = os.environ['SINGA_AUTO_IMAGE_WORKER']
     worker_version = os.environ['SINGA_AUTO_VERSION']
     self._base_worker_image = '{}:{}'.format(worker_image, worker_version)

     self._services_manager = ServicesManager(self._meta_store,
                                              container_manager)
Esempio n. 3
0
class _SubTrainJobMonitor:
    '''
        Manages fetching & updating of metadata & datasets

        Reads worker, sub train job, train job, model and dataset records
        through the meta store, resolves dataset paths through a
        `FileDataStore`, and writes trial status changes and trial logs
        back to the meta store.
    '''

    def __init__(self, service_id: str, meta_store: MetaStore = None):
        '''
            :param service_id: ID of the service, used to look up the train job worker
            :param meta_store: Meta store to use; when omitted (or falsy),
                a new `MetaStore` is created
        '''
        # The following are populated by `pull_job_info()`
        self.sub_train_job_id = None
        self.model_class = None
        self.train_dataset_path = None
        self.val_dataset_path = None
        self.train_args = None
        self._meta_store = meta_store or MetaStore()
        self._service_id = service_id
        self._data_store = FileDataStore()

    def pull_job_info(self):
        '''
            Reads the job info for this service from the meta store and
            populates `sub_train_job_id`, `model_class`, `train_dataset_path`,
            `val_dataset_path` and `train_args`.

            :raises InvalidWorkerError: if the worker, sub train job,
                train job or model record cannot be found
            :raises InvalidDatasetError: if the datasets cannot be resolved
        '''
        service_id = self._service_id

        logger.info('Reading job info from meta store...')
        with self._meta_store:
            worker = self._meta_store.get_train_job_worker(service_id)
            if worker is None:
                raise InvalidWorkerError(
                    'No such worker "{}"'.format(service_id))

            sub_train_job = self._meta_store.get_sub_train_job(
                worker.sub_train_job_id)
            if sub_train_job is None:
                # NOTE(review): message says "advisor" although the lookup
                # is for a train job worker - confirm the intended wording
                raise InvalidWorkerError(
                    'No such sub train job associated with advisor "{}"'.format(
                        service_id))

            train_job = self._meta_store.get_train_job(
                sub_train_job.train_job_id)
            if train_job is None:
                raise InvalidWorkerError(
                    'No such train job with ID "{}"'.format(
                        sub_train_job.train_job_id))

            model = self._meta_store.get_model(sub_train_job.model_id)
            if model is None:
                raise InvalidWorkerError('No such model with ID "{}"'.format(
                    sub_train_job.model_id))
            logger.info(f'Using model "{model.name}"...')

            # Datasets are resolved while still inside the meta store context,
            # as `_load_datasets()` queries the meta store as well
            (self.train_dataset_path,
             self.val_dataset_path) = self._load_datasets(train_job)
            self.train_args = train_job.train_args
            self.sub_train_job_id = sub_train_job.id
            self.model_class = load_model_class(model.model_file_bytes,
                                                model.model_class)

    def mark_trial_as_errored(self, trial_id):
        '''
            Fetches the trial with ID `trial_id` and marks it as errored in the meta store.
        '''
        logger.info('Marking trial as errored in store...')
        with self._meta_store:
            trial = self._meta_store.get_trial(trial_id)
            self._meta_store.mark_trial_as_errored(trial)

    def mark_trial_as_running(self, trial_id, proposal):
        '''
            Fetches the trial with ID `trial_id` and marks it as running in
            the meta store, passing along `proposal.to_jsonable()`.
        '''
        logger.info('Marking trial as running in store...')
        with self._meta_store:
            trial = self._meta_store.get_trial(trial_id)
            self._meta_store.mark_trial_as_running(trial,
                                                   proposal.to_jsonable())

    def mark_trial_as_completed(self, trial_id, score, store_params_id):
        '''
            Fetches the trial with ID `trial_id` and marks it as completed in
            the meta store, passing along `score` and `store_params_id`.
        '''
        logger.info('Marking trial as completed in store...')
        with self._meta_store:
            trial = self._meta_store.get_trial(trial_id)
            self._meta_store.mark_trial_as_completed(trial, score,
                                                     store_params_id)

    def log_to_trial(self, trial_id, log_line, log_lvl):
        '''
            Adds `log_line` with level `log_lvl` to the logs of the trial
            with ID `trial_id` in the meta store.
        '''
        with self._meta_store:
            trial = self._meta_store.get_trial(trial_id)
            self._meta_store.add_trial_log(trial, log_line, log_lvl)

    def _load_datasets(self, train_job):
        '''
            Resolves the train & validation datasets of `train_job` to the
            paths returned by the data store.

            Does not enter the meta store context itself; `pull_job_info()`
            calls it from inside its `with self._meta_store:` block.

            :param train_job: record providing `train_dataset_id` and `val_dataset_id`
            :returns: tuple of (train dataset path, validation dataset path)
            :raises InvalidDatasetError: if a dataset record or a dataset path
                is missing, or if any lookup or load raises
        '''
        try:
            # Explicit checks instead of `assert`: assertions are removed when
            # Python runs with `-O`, which would let `None` slip through
            train_dataset = self._meta_store.get_dataset(
                train_job.train_dataset_id)
            if train_dataset is None:
                raise InvalidDatasetError(
                    'No such train dataset with ID "{}"'.format(
                        train_job.train_dataset_id))

            val_dataset = self._meta_store.get_dataset(train_job.val_dataset_id)
            if val_dataset is None:
                raise InvalidDatasetError(
                    'No such validation dataset with ID "{}"'.format(
                        train_job.val_dataset_id))

            train_dataset_path = self._data_store.load(
                train_dataset.store_dataset_id)
            val_dataset_path = self._data_store.load(
                val_dataset.store_dataset_id)
            if train_dataset_path is None or val_dataset_path is None:
                raise InvalidDatasetError(
                    'Data store returned no path for the train or validation dataset')
        except InvalidDatasetError:
            # Already the right type with a descriptive message; don't re-wrap
            raise
        except Exception as e:
            # Keep the original failure attached as the cause for debugging
            raise InvalidDatasetError(e) from e

        return (train_dataset_path, val_dataset_path)