def superadmin_client():
    from rafiki.client import Client
    admin_host = os.environ['ADMIN_HOST']
    admin_port = os.environ['ADMIN_PORT']
    client = Client(admin_host=admin_host, admin_port=admin_port)
    client.login(email=SUPERADMIN_EMAIL, password=os.environ['SUPERADMIN_PASSWORD'])
    return client
def make_user(user_type, email=None, password=None):
    email = email or gen_email()
    password = password or gen()
    client = Client()
    client.login(superadmin_email, superadmin_password)
    client.create_user(email, password, user_type)
    client.login(email, password)
    return client
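# A minimal usage sketch (not part of the original fixtures): assuming the make_user
# fixture above and the UserType constants from rafiki.constants used in the other
# scripts here, a test can obtain pre-logged-in clients for specific roles like this.
from rafiki.constants import UserType

def test_users_of_each_role_can_log_in():
    model_dev_client = make_user(UserType.MODEL_DEVELOPER)  # fresh, logged-in model developer
    app_dev_client = make_user(UserType.APP_DEVELOPER)      # fresh, logged-in app developer
    assert model_dev_client is not None
    assert app_dev_client is not None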
def _make_client(self):
    admin_host = os.environ['ADMIN_HOST']
    admin_port = os.environ['ADMIN_PORT']
    advisor_host = os.environ['ADVISOR_HOST']
    advisor_port = os.environ['ADVISOR_PORT']
    superadmin_email = SUPERADMIN_EMAIL
    superadmin_password = SUPERADMIN_PASSWORD
    client = Client(admin_host=admin_host, admin_port=admin_port,
                    advisor_host=advisor_host, advisor_port=advisor_port)
    client.login(email=superadmin_email, password=superadmin_password)
    return client
  ],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]

client = Client(admin_host=RAFIKI_HOST, admin_port=ADMIN_PORT)
client.login(email=SUPERADMIN_EMAIL, password=USER_PASSWORD)

print('Creating model developer in Rafiki...')
create_user(client, MODEL_DEVELOPER_EMAIL, USER_PASSWORD, UserType.MODEL_DEVELOPER)

print('Creating app developer in Rafiki...')
create_user(client, APP_DEVELOPER_EMAIL, USER_PASSWORD, UserType.APP_DEVELOPER)

print('Logging in as model developer...')
client.login(email=MODEL_DEVELOPER_EMAIL, password=USER_PASSWORD)

print('Adding models to Rafiki...')
create_model(client, 'TfFeedForward', task, 'examples/models/image_classification/TfFeedForward.py', \
             'TfFeedForward', dependencies={ ModelDependency.TENSORFLOW: '1.12.0' })
import pprint
import os

from rafiki.client import Client
from rafiki.config import SUPERADMIN_EMAIL, SUPERADMIN_PASSWORD

def seed_users(client):
    users = client.create_users('examples/seeds/users.csv')
    pprint.pprint(users)

if __name__ == '__main__':
    rafiki_host = os.environ.get('RAFIKI_HOST', 'localhost')
    admin_port = int(os.environ.get('ADMIN_EXT_PORT', 3000))
    admin_web_port = int(os.environ.get('ADMIN_WEB_EXT_PORT', 3001))
    user_email = os.environ.get('USER_EMAIL', SUPERADMIN_EMAIL)
    user_password = os.environ.get('USER_PASSWORD', SUPERADMIN_PASSWORD)

    # Initialize client
    client = Client(admin_host=rafiki_host, admin_port=admin_port)
    client.login(email=user_email, password=user_password)

    seed_users(client)
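# A hedged sketch (assumption, not from the original script): create_users reads a CSV of
# users at the path above; given create_user(email, password, user_type) in the other
# scripts here, a seed file with matching columns could be generated like this. The column
# names and example values are illustrative, not Rafiki's documented schema.
import csv

def write_example_seed_file(path='examples/seeds/users.csv'):
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['email', 'password', 'user_type'])                 # assumed header row
        writer.writerow(['model_dev@rafiki', 'rafiki', 'MODEL_DEVELOPER'])  # illustrative users
        writer.writerow(['app_dev@rafiki', 'rafiki', 'APP_DEVELOPER'])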
                         dependencies={ModelDependency.TENSORFLOW: '1.12.0'})
    pprint(model)

    print('Creating train job...')
    budget = {
        BudgetOption.TIME_HOURS: hours,
        BudgetOption.GPU_COUNT: gpus
    }
    train_job = client.create_train_job(app, task, train_dataset['id'], val_dataset['id'],
                                        budget, models=[model['id']])
    pprint(train_job)

    print('Monitor the train job on Rafiki Web Admin')

    # TODO: Evaluate on test dataset?

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--email', type=str, default=SUPERADMIN_EMAIL, help='Email of user')
    parser.add_argument('--password', type=str, default=os.environ.get('SUPERADMIN_PASSWORD'), help='Password of user')
    parser.add_argument('--gpus', type=int, default=0, help='How many GPUs to use')
    parser.add_argument('--hours', type=float, default=24, help='How long the train job should run for (in hours)')
    out_train_dataset_path = 'data/cifar10_train.zip'
    out_val_dataset_path = 'data/cifar10_val.zip'
    (args, _) = parser.parse_known_args()

    # Initialize client
    client = Client()
    client.login(email=args.email, password=args.password)

    run_enas(client, out_train_dataset_path, out_val_dataset_path, args.gpus, args.hours)
class TrainWorker(object):
    def __init__(self, service_id, worker_id, db=None):
        if db is None:
            db = Database()
        self._service_id = service_id
        self._db = db
        self._worker_id = worker_id
        self._trial_id = None
        self._sub_train_job_id = None
        self._client = Client(admin_host=os.environ['ADMIN_HOST'],
                              admin_port=os.environ['ADMIN_PORT'],
                              advisor_host=os.environ['ADVISOR_HOST'],
                              advisor_port=os.environ['ADVISOR_PORT'])
        self._params_root_dir = os.path.join(os.environ['WORKDIR_PATH'],
                                             os.environ['PARAMS_DIR_PATH'])

    def start(self):
        logger.info('Starting train worker for service of ID "{}"...' \
            .format(self._service_id))

        # TODO: Break up crazily long & unreadable method
        advisor_id = None
        while True:
            with self._db:
                (self._sub_train_job_id, budget, model_id, model_file_bytes, model_class, \
                    train_job_id, train_dataset_uri, test_dataset_uri) = self._read_worker_info()

                self._get_client().send_event('train_job_worker_started',
                                              sub_train_job_id=self._sub_train_job_id)

                if self._if_budget_reached(budget): # If budget reached
                    logger.info('Budget for train job has been reached')
                    self._stop_sub_train_job()
                    if advisor_id is not None:
                        self._delete_advisor(advisor_id)
                    break

                # Create a new trial
                logger.info('Creating new trial in DB...')
                trial = self._db.create_trial(
                    sub_train_job_id=self._sub_train_job_id,
                    model_id=model_id,
                    worker_id=self._worker_id)
                self._db.commit()
                self._trial_id = trial.id
                logger.info('Created trial of ID "{}" in DB'.format(self._trial_id))

            # Don't keep DB connection while training model

            # Perform trial & record results
            score = 0
            try:
                logger.info('Starting trial...')

                # Load model class from bytes
                logger.info('Loading model class...')
                clazz = load_model_class(model_file_bytes, model_class)

                # If not created, create a Rafiki advisor for train worker to propose knobs in trials
                if advisor_id is None:
                    logger.info('Creating Rafiki advisor...')
                    advisor_id = self._create_advisor(clazz)
                    logger.info('Created advisor of ID "{}"'.format(advisor_id))

                # Generate knobs for trial
                logger.info('Requesting for knobs proposal from advisor...')
                knobs = self._get_proposal_from_advisor(advisor_id)
                logger.info('Received proposal of knobs from advisor:')
                logger.info(pprint.pformat(knobs))

                # Mark trial as running in DB
                logger.info('Training & evaluating model...')
                with self._db:
                    trial = self._db.get_trial(self._trial_id)
                    self._db.mark_trial_as_running(trial, knobs)

                def handle_log(log_line, log_lvl):
                    with self._db:
                        trial = self._db.get_trial(self._trial_id)
                        self._db.add_trial_log(trial, log_line, log_lvl)

                (score, params_file_path) = self._train_and_evaluate_model(
                    clazz, knobs, train_dataset_uri, test_dataset_uri, handle_log)
                logger.info('Trial score: {}'.format(score))

                with self._db:
                    logger.info('Marking trial as complete in DB...')
                    trial = self._db.get_trial(self._trial_id)
                    self._db.mark_trial_as_complete(trial, score, params_file_path)

                # Report results of trial to advisor
                try:
                    logger.info('Sending result of trials\' knobs to advisor...')
                    self._feedback_to_advisor(advisor_id, knobs, score)
                except Exception:
                    logger.error('Error while sending result of proposal to advisor:')
                    logger.error(traceback.format_exc())

                self._trial_id = None

            except Exception:
                logger.error('Error while running trial:')
                logger.error(traceback.format_exc())
                logger.info('Marking trial as errored in DB...')
                with self._db:
                    trial = self._db.get_trial(self._trial_id)
                    self._db.mark_trial_as_errored(trial)
                self._trial_id = None
                break # Exit worker upon trial error

    def stop(self):
        # If worker is currently running a trial, mark it as terminated
        logger.info('Marking trial as terminated in DB...')
        try:
            if self._trial_id is not None:
                with self._db:
                    trial = self._db.get_trial(self._trial_id)
                    self._db.mark_trial_as_terminated(trial)
        except Exception:
            logger.error('Error marking trial as terminated:')
            logger.error(traceback.format_exc())

        if self._sub_train_job_id is not None:
            self._get_client().send_event('train_job_worker_stopped',
                                          sub_train_job_id=self._sub_train_job_id)

    def _train_and_evaluate_model(self, clazz, knobs, train_dataset_uri, \
                                  test_dataset_uri, handle_log):
        # Initialize model
        model_inst = clazz(**knobs)

        # Add logs handlers for trial, including adding handler to root logger
        # to handle logs emitted during model training with level above INFO
        log_handler = ModelLoggerHandler(handle_log)
        root_logger = logging.getLogger()
        root_logger.addHandler(log_handler)
        py_model_logger = logging.getLogger('{}.trial'.format(__name__))
        py_model_logger.setLevel(logging.INFO)
        py_model_logger.propagate = False # Avoid duplicate logs in root logger
        py_model_logger.addHandler(log_handler)
        model_logger.set_logger(py_model_logger)

        # Train model
        model_inst.train(train_dataset_uri)

        # Evaluate model
        score = model_inst.evaluate(test_dataset_uri)

        # Remove log handlers from loggers for this trial
        root_logger.removeHandler(log_handler)
        py_model_logger.removeHandler(log_handler)

        # Dump and pickle model parameters
        parameters = model_inst.dump_parameters()
        parameters = pickle.dumps(parameters)
        params_file_path = os.path.join(self._params_root_dir, '{}.model'.format(self._trial_id))
        with open(params_file_path, 'wb') as f:
            f.write(parameters)

        model_inst.destroy()

        return (score, params_file_path)

    # Gets proposal of a set of knob values from advisor
    def _get_proposal_from_advisor(self, advisor_id):
        res = self._get_client()._generate_proposal(advisor_id)
        knobs = res['knobs']
        return knobs

    # Feedback result of knobs to advisor
    def _feedback_to_advisor(self, advisor_id, knobs, score):
        self._get_client()._feedback_to_advisor(advisor_id, knobs, score)

    def _stop_sub_train_job(self):
        logger.warn('Stopping sub train job...')
        try:
            self._get_client().send_event('sub_train_job_budget_reached',
                                          sub_train_job_id=self._sub_train_job_id)
        except Exception:
            # Throw just a warning - likely that another worker has stopped it
            logger.warn('Error while stopping sub train job:')
            logger.warn(traceback.format_exc())

    def _create_advisor(self, clazz):
        # Retrieve knob config for model of worker
        knob_config = clazz.get_knob_config()
        knob_config_str = serialize_knob_config(knob_config)

        # Create advisor associated with worker
        res = self._get_client()._create_advisor(knob_config_str, advisor_id=self._service_id)
        advisor_id = res['id']
        return advisor_id

    # Delete advisor
    def _delete_advisor(self, advisor_id):
        try:
            self._get_client()._delete_advisor(advisor_id)
        except Exception:
            # Throw just a warning - not critical for advisor to be deleted
            logger.warning('Error while deleting advisor:')
            logger.warning(traceback.format_exc())

    # Returns whether the worker reached its budget (only consider COMPLETED or ERRORED trials)
    def _if_budget_reached(self, budget):
        # By default, budget is model trial count of 5
        max_trials = budget.get(BudgetType.MODEL_TRIAL_COUNT, 5)
        trials = self._db.get_trials_of_sub_train_job(self._sub_train_job_id)
        trials = [x for x in trials if x.status in [TrialStatus.COMPLETED, TrialStatus.ERRORED]]
        return len(trials) >= max_trials

    def _read_worker_info(self):
        worker = self._db.get_train_job_worker(self._service_id)

        if worker is None:
            raise InvalidWorkerException()
        sub_train_job = self._db.get_sub_train_job(worker.sub_train_job_id)
        train_job = self._db.get_train_job(sub_train_job.train_job_id)
        model = self._db.get_model(sub_train_job.model_id)

        if model is None:
            raise InvalidModelException()

        if train_job is None or sub_train_job is None:
            raise InvalidTrainJobException()

        return (sub_train_job.id, train_job.budget, model.id, model.model_file_bytes,
                model.model_class, train_job.id, train_job.train_dataset_uri,
                train_job.test_dataset_uri)

    def _get_client(self):
        self._client.login(email=SUPERADMIN_EMAIL, password=SUPERADMIN_PASSWORD)
        return self._client
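# A minimal usage sketch (assumption, not part of the original worker module): the
# TrainWorker above needs only a service ID and worker ID, with admin/advisor hosts,
# ports and the params directory read from the environment. The SERVICE_ID and
# WORKER_ID environment variable names below are placeholders, not Rafiki's actual
# entrypoint contract.
if __name__ == '__main__':
    worker = TrainWorker(service_id=os.environ['SERVICE_ID'],
                         worker_id=os.environ['WORKER_ID'])
    try:
        worker.start()  # runs trials until the budget is reached or a trial errors
    finally:
        worker.stop()   # marks any in-flight trial as terminated and emits a stopped event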
from examples.scripts.client_quickstart import RAFIKI_HOST, ADMIN_PORT, USER_PASSWORD, MODEL_DEVELOPER_EMAIL, \
    create_model
from rafiki.client import Client
from rafiki.constants import TaskType, ModelDependency

if __name__ == '__main__':
    app = 'home_rentals_regression'
    task = TaskType.TABLE_REGRESSION

    client = Client(admin_host=RAFIKI_HOST, admin_port=ADMIN_PORT)

    print('Logging in as model developer...')
    client.login(email=MODEL_DEVELOPER_EMAIL, password=USER_PASSWORD)

    print('Adding models to Rafiki...')
    create_model(client, 'SkLasso', task, 'examples/models/table_regression/SkLasso.py',
                 'SkLasso', dependencies={ModelDependency.SCIKIT_LEARN: '0.20.0'})
def superadmin():
    client = Client()
    client.login(superadmin_email, superadmin_password)
    return client
from examples.scripts.client_quickstart import RAFIKI_HOST, ADMIN_PORT, USER_PASSWORD, create_user, \
    MODEL_DEVELOPER_EMAIL, APP_DEVELOPER_EMAIL
from rafiki.client import Client
from rafiki.config import SUPERADMIN_EMAIL
from rafiki.constants import UserType

if __name__ == '__main__':
    client = Client(admin_host=RAFIKI_HOST, admin_port=ADMIN_PORT)
    client.login(email=SUPERADMIN_EMAIL, password=USER_PASSWORD)

    print('Creating model developer in Rafiki...')
    create_user(client, MODEL_DEVELOPER_EMAIL, USER_PASSWORD, UserType.MODEL_DEVELOPER)

    print('Creating app developer in Rafiki...')
    create_user(client, APP_DEVELOPER_EMAIL, USER_PASSWORD, UserType.APP_DEVELOPER)
    res = requests.post(url='http://{}/predict'.format(predictor_host), json={'query': QUERY})

    if res.status_code != 200:
        raise Exception(res.text)

    pprint.pprint(res.json())

def stop_inference_job(client):
    pprint.pprint(client.stop_inference_job(app=APP))

if __name__ == '__main__':
    client = Client(admin_host=ADMIN_HOST, admin_port=ADMIN_PORT)
    client.login(email=USER_EMAIL, password=USER_PASSWORD)

    print('Adding models to Rafiki...')
    create_models(client)

    print('Creating train job for app "{}" on Rafiki...'.format(APP))
    create_train_job(client)

    print('Waiting for train job to complete...')
    wait_until_train_job_has_completed(client)
    print('Train job has been completed!')

    print('Listing best trials of latest train job for app "{}"...'.format(APP))
    list_best_trials_of_train_job(client)