def test_register_dynamic_profiling_result():
    """Append a dynamic profiling result to the first 'ResNet50' model."""
    resnet = ModelService.get_models_by_name('ResNet50')[0]

    # Every latency / throughput slot receives the same dummy triple.
    dummy = (1, 1, 1)
    profiling_result = DynamicProfileResultBO(
        'gpu:01',
        'Tesla K40c',
        1,
        ProfileMemory(1000, 1000, 1000),
        ProfileLatency(dummy, dummy, dummy, dummy),
        ProfileThroughput(dummy, dummy, dummy, dummy),
    )

    appended = ModelService.append_dynamic_profiling_result(resnet.id, profiling_result)
    assert appended
def test_update_dynamic_profiling_result():
    """Update a dynamic profiling result and verify the stored values changed."""
    resnet = ModelService.get_models('ResNet50')[0]

    baseline_stats = InfoTuple(avg=1, p50=1, p95=1, p99=1)
    changed_stats = InfoTuple(avg=1, p50=2, p95=1, p99=1)

    memory = ProfileMemory(1000, 2000, 0.5)
    latency = ProfileLatency(
        init_latency=baseline_stats,
        preprocess_latency=baseline_stats,
        inference_latency=changed_stats,
        postprocess_latency=baseline_stats,
    )
    throughput = ProfileThroughput(
        batch_formation_throughput=1,
        preprocess_throughput=1,
        inference_throughput=1,
        postprocess_throughput=1,
    )
    updated_result = DynamicProfileResultBO(
        device_id='gpu:01',
        device_name='Tesla K40c',
        batch=1,
        memory=memory,
        latency=latency,
        throughput=throughput,
    )

    # The service call must report success.
    assert ModelService.update_dynamic_profiling_result(resnet.id, updated_result)

    # Reload the model and inspect the first stored dynamic result.
    reloaded = ModelService.get_models('ResNet50')[0]
    stored = reloaded.profile_result.dynamic_results[0]
    assert stored.memory.memory_usage == 2000
    assert stored.latency.inference_latency.p50 == 2
def serve_by_task(args):
    """Serve the first model found for a task and mark it as in service.

    Arguments:
        args: Argument namespace. The attributes read here are `task`,
            `device`, `name` and `bs`.
    """
    model_bo = retrieve_model_by_task(task=args.task)
    serve(model_bo[0].saved_path, device=args.device, name=args.name, batch_size=args.bs)

    # TODO: check if the service is dispatched successfully
    # Drop both the CONVERTED and the PUBLISHED status. The previous
    # `item is not (CONVERTED or PUBLISHED)` only compared against CONVERTED,
    # because `or` evaluates to its first truthy operand.
    replaced_statuses = (ModelStatus.CONVERTED, ModelStatus.PUBLISHED)
    new_status = [item for item in model_bo[0].model_status if item not in replaced_statuses]
    new_status.append(ModelStatus.IN_SERVICE)
    model_bo[0].model_status = new_status
    ModelService.update_model(model_bo[0])
def terminate(self):
    """Abort the running training task and move the model from TRAINING to DRAFT.

    Does nothing when no task has been recorded on this trainer.
    """
    if not self._task:
        return

    # Raising KeyboardInterrupt in the task mimics a ^C, which triggers the
    # PyTorch Lightning graceful training shutdown.
    self._task.set_exception(KeyboardInterrupt())

    failed_update = TrainingJobUpdate(_id=self._id, status=Status.FAIL)
    model_train_curd.update(failed_update)

    trained_model = ModelService.get_model_by_id(self.model_id)
    trained_model.model_status.remove(ModelStatus.TRAINING)
    trained_model.model_status.append(ModelStatus.DRAFT)
    ModelService.update_model(trained_model)
def test_update_dynamic_profiling_result():
    """Update a dynamic profiling result and verify the stored CPU memory value."""
    resnet = ModelService.get_models_by_name('ResNet50')[0]

    # Every latency / throughput slot receives the same dummy triple.
    dummy = (1, 1, 1)
    updated_result = DynamicProfileResultBO(
        'gpu:01',
        'Tesla K40c',
        1,
        ProfileMemory(1000, 2000, 1000),
        ProfileLatency(dummy, dummy, dummy, dummy),
        ProfileThroughput(dummy, dummy, dummy, dummy),
    )

    # The service call must report success.
    assert ModelService.update_dynamic_profiling_result(resnet.id, updated_result)

    # Reload the model and inspect the first stored dynamic result.
    reloaded = ModelService.get_models_by_name('ResNet50')[0]
    stored = reloaded.profile_result.dynamic_results[0]
    assert stored.memory.cpu_memory == 2000
def test_update_model():
    """Change accuracy and weight of the first 'ResNet50' model and verify persistence."""
    edited = ModelService.get_models('ResNet50')[0]
    edited.acc = 0.9
    edited.weight.weight = bytes([123, 255])

    # The update call must report success.
    assert ModelService.update_model(edited)

    # Fetch the model again and compare against what was written.
    stored = ModelService.get_models('ResNet50')[0]
    acc_error = abs(stored.acc - 0.9)
    assert acc_error < 1e-6
    assert stored.weight.weight == edited.weight.weight
def serve_by_name(args):
    """Serve the first model matching a name, framework and engine.

    After dispatching, the model's status list is rewritten so that it is
    marked as in service.

    Arguments:
        args: Argument namespace. The attributes read here are `model`,
            `framework`, `engine`, `device`, `name` and `bs`. `framework` and
            `engine` are upper-cased and looked up as enum member names.
    """
    model = args.model
    framework = Framework[args.framework.upper()]
    engine = Engine[args.engine.upper()]

    # The architecture name is passed positionally: the keyword used before
    # (`architecture`) does not match the `architecture_name` parameter of
    # `retrieve_model`.
    model_bo = retrieve_model(model, framework=framework, engine=engine)
    serve(model_bo[0].saved_path, device=args.device, name=args.name, batch_size=args.bs)

    # TODO: check if the service is dispatched successfully
    # Drop both the CONVERTED and the PUBLISHED status. The previous
    # `item is not (CONVERTED or PUBLISHED)` only compared against CONVERTED,
    # because `or` evaluates to its first truthy operand.
    replaced_statuses = (ModelStatus.CONVERTED, ModelStatus.PUBLISHED)
    new_status = [item for item in model_bo[0].model_status if item not in replaced_statuses]
    new_status.append(ModelStatus.IN_SERVICE)
    model_bo[0].model_status = new_status
    ModelService.update_model(model_bo[0])
def start(self):
    """Submit the training run to the executor and move the model from DRAFT to TRAINING."""

    def on_training_finished(_finished_task):
        # Invoked by the future once the submitted fit call is done.
        passed_update = TrainingJobUpdate(_id=self._id, status=Status.PASS)
        model_train_curd.update(passed_update)
        # TODO: save to database and update model_status, engine
        print(self.export_model())

    self._task = self._executor.submit(
        self.trainer_engine.fit,
        self.model,
        **self._data_loader_kwargs,
    )
    self._task.add_done_callback(on_training_finished)

    running_update = TrainingJobUpdate(_id=self._id, status=Status.RUNNING)
    model_train_curd.update(running_update)

    training_model = ModelService.get_model_by_id(self.model_id)
    training_model.model_status.remove(ModelStatus.DRAFT)
    training_model.model_status.append(ModelStatus.TRAINING)
    ModelService.update_model(training_model)
def retrieve_model_by_name(architecture_name: str = 'ResNet50', framework: Framework = None, engine: Engine = None):
    """Look up a model by architecture name, optionally narrowed by framework and engine.

    The first match is passed to `get_remote_model_weight` before it is returned.

    Arguments:
        architecture_name (str): Model architecture name.
        framework (Framework): Optional framework query key. Default to None.
        engine (Engine): Optional model optimization engine query key. Default to None.

    Returns:
        ModelBO: The first matching model business object.

    Raises:
        FileNotFoundError: If the query returns no model.
    """
    candidates = ModelService.get_models_by_name(
        architecture_name,
        framework=framework,
        engine=engine,
    )

    if len(candidates) == 0:
        raise FileNotFoundError('Model not found!')

    # TODO: filter version
    first_match = candidates[0]
    get_remote_model_weight(first_match)
    return first_match
def retrieve_model(
        architecture_name: str = 'ResNet50',
        task: Task = None,
        framework: Framework = None,
        engine: Engine = None,
        version: ModelVersion = None,
        download: bool = True,
) -> List[ModelBO]:
    """Look up models by name, task, framework, engine or version.

    Arguments:
        architecture_name (str): Model architecture name.
        task (Task): Machine learning task the model is used for. Default to None.
        framework (Framework): Optional framework query key. Default to None.
        engine (Engine): Optional model optimization engine query key. Default to None.
        version (ModelVersion): Model version. Default to None.
        download (bool): Whether the matched models should be cached locally.

    Returns:
        List[ModelBO]: Matching model business objects; may be empty.
    """
    matches = ModelService.get_models(
        architecture_name,
        task=task,
        framework=framework,
        engine=engine,
        version=version,
    )

    # Only fetch weights when something was found and caching was requested.
    needs_caching = len(matches) != 0 and download
    if needs_caching:
        _get_remote_model_weights(matches)

    return matches
def test_get_model_by_task():
    """Query by the image classification task and expect exactly one matching model."""
    expected_task = Task.IMAGE_CLASSIFICATION
    found = ModelService.get_models_by_task(expected_task)

    # exactly one model is expected
    assert len(found) == 1

    # every returned model carries the queried task
    assert all(entry.task == Task.IMAGE_CLASSIFICATION for entry in found)
def test_get_model_by_task():
    """Query by the 'image classification' task and expect exactly one matching model."""
    found = ModelService.get_models_by_task('image classification')

    # exactly one model is expected
    assert len(found) == 1

    # every returned model carries the queried task
    assert all(entry.task == 'image classification' for entry in found)
def test_get_model_by_name():
    """Query by the name 'ResNet50' and expect exactly one matching model."""
    found = ModelService.get_models('ResNet50')

    # exactly one model is expected
    assert len(found) == 1

    # every returned model carries the queried name
    assert all(entry.name == 'ResNet50' for entry in found)
def test_register_dynamic_profiling_result():
    """Append a dynamic profiling result to the first 'ResNet50' model."""
    resnet = ModelService.get_models('ResNet50')[0]
    stats = InfoTuple(avg=1, p50=1, p95=1, p99=1)

    memory = ProfileMemory(1000, 1000, 0.5)
    latency = ProfileLatency(
        init_latency=stats,
        preprocess_latency=stats,
        inference_latency=stats,
        postprocess_latency=stats,
    )
    throughput = ProfileThroughput(
        batch_formation_throughput=1,
        preprocess_throughput=1,
        inference_throughput=1,
        postprocess_throughput=1,
    )
    profiling_result = DynamicProfileResultBO(
        device_id='gpu:01',
        device_name='Tesla K40c',
        batch=1,
        memory=memory,
        latency=latency,
        throughput=throughput,
    )

    appended = ModelService.append_dynamic_profiling_result(resnet.id, profiling_result)
    assert appended
def run(self) -> None:
    """Process profiling jobs from the queue until a stop item arrives.

    The loop ends when the queue yields `None` or the finish flag object.
    For each job, a container is started when the job names none, the model
    is profiled, the result is appended to the model, and a container
    started here is stopped again.
    """
    from modelci.hub.deployer.dispatcher import serve

    for job in iter(self.job_queue.get, None):
        # the finish flag tells the worker to leave the queue
        if job is self._queue_finish_flag:
            break

        if job.container_name is not None:
            server_name = job.container_name
        else:
            # no container supplied with the job: start one for it
            started = serve(save_path=job.model.saved_path, device=job.device)
            server_name = started.name
            # keep hold of it so it can be cleaned up afterwards
            self._hold_container.put(started)

        # mark the model as running while it is profiled
        job.model.status = Status.RUNNING
        ModelService.update_model(job.model)

        profiler = Profiler(model_info=job.model, server_name=server_name, inspector=job.client)
        profiling_result = profiler.diagnose(device=job.device)
        ModelService.append_dynamic_profiling_result(job.model.id, dynamic_result=profiling_result)

        # mark the model as passed once the result is stored
        job.model.status = Status.PASS
        ModelService.update_model(job.model)

        if job.container_name is None:
            # take back the held container and stop it
            self._hold_container.get().stop()
def get_all_model(name: str = None, framework: Framework_ = None, engine: Engine_ = None, version: int = None):
    """List models matching the given optional filters.

    When given, `framework` and `engine` are translated to `Framework` /
    `Engine` members by looking up their upper-cased `value` as a member name.

    Returns:
        list: One `ModelListOut` per matching model business object.
    """
    framework_key = None if framework is None else Framework[framework.value.upper()]
    engine_key = None if engine is None else Engine[engine.value.upper()]

    model_bos = ModelService.get_models(
        name=name,
        framework=framework_key,
        engine=engine_key,
        version=version,
    )
    return [ModelListOut.from_bo(model_bo) for model_bo in model_bos]
def from_training_job(cls, training_job: TrainingJob) -> 'PyTorchTrainer':
    """Build a trainer for fine-tuning from a training job description.

    Arguments:
        training_job (TrainingJob): Job description. The fields read here are
            `id`, `model`, `loss_function`, `data_module`, `optimizer_property`,
            `lr_scheduler_property`, `min_epochs` and `max_epochs`.

    Returns:
        PyTorchTrainer: Trainer instance created through `cls`.

    Raises:
        ValueError: If the referenced model's engine is not `Engine.PYTORCH`.
    """
    # TODO: only support fine-tune
    model_bo = ModelService.get_model_by_id(training_job.model)
    if model_bo.engine != Engine.PYTORCH:
        raise ValueError(f'Model engine expected `{Engine.PYTORCH}`, but got {model_bo.engine}.')

    # download the weight as a local cache and load the network from it
    cache_path = get_remote_model_weight(model_bo)
    net = torch.load(cache_path)
    freeze(module=net, n=-1, train_bn=True)

    # build pytorch lightning module
    # NOTE(review): the loss function is resolved with `eval` on the job's
    # `loss_function` text; confirm this value can never be untrusted input.
    module_kwargs = dict(
        net=net,
        loss=eval(str(training_job.loss_function))(),  # nosec
        batch_size=training_job.data_module.batch_size,
        num_workers=training_job.data_module.num_workers,
    )

    # optional hyper-parameters are forwarded only when they are truthy
    optimizer_property = training_job.optimizer_property
    if optimizer_property.lr:
        module_kwargs['lr'] = optimizer_property.lr

    scheduler_property = training_job.lr_scheduler_property
    if scheduler_property.gamma:
        module_kwargs['lr_scheduler_gamma'] = scheduler_property.gamma
    if scheduler_property.step_size:
        module_kwargs['step_size'] = scheduler_property.step_size

    model = FineTuneModule(**module_kwargs)
    data_module = PyTorchDataModule(**training_job.data_module.dict(exclude_none=True))

    epoch_kwargs = training_job.dict(exclude_none=True, include={'min_epochs', 'max_epochs'})
    trainer_kwargs = {
        'default_root_dir': training_job.data_module.data_dir or OUTPUT_DIR,
        'weights_summary': None,
        'progress_bar_refresh_rate': 1,
        'num_sanity_val_steps': 0,
        'gpus': 1,  # TODO: set GPU number
        **epoch_kwargs,
    }

    return cls(
        id=training_job.id,
        model=model,
        data_loader_kwargs={'datamodule': data_module},
        trainer_kwargs=trainer_kwargs,
    )
def retrieve_model_by_parent_id(parent_id: str) -> List[ModelBO]:
    """Look up the models derived from a given parent model.

    Args:
        parent_id (str): ID of the parent model the queried models are derived from.

    Returns:
        List[ModelBO]: A list of model business objects.

    Raises:
        FileNotFoundError: If the query returns no model.
    """
    children = ModelService.get_models_by_parent_id(parent_id)

    if len(children) == 0:
        raise FileNotFoundError('Model not found!')

    return children
def generate_model_graph(*, id: str):  # noqa
    """Render the computation graph of a model in dot format.

    Arguments:
        id (str): Model object ID.

    Returns:
        dict: `{'dot': <graph text>}`. The text is empty when the model's
            engine is not `Engine.PYTORCH`.
    """
    model_bo = ModelService.get_model_by_id(id)

    # only PyTorch models are rendered; others yield an empty graph
    if model_bo.engine != Engine.PYTORCH:
        return {'dot': ''}

    pytorch_model = torch.load(model_bo.saved_path)

    # zero-filled sample input with a leading dimension of 1, followed by the
    # first declared input's shape without its first dimension
    trailing_dims = model_bo.inputs[0].shape[1:]
    sample_data = torch.zeros(1, *trailing_dims, dtype=torch.float, requires_grad=False)
    out = pytorch_model(sample_data)

    graph_params = dict(pytorch_model.named_parameters())
    graph_params['x'] = sample_data
    dot_graph = make_dot(out, params=graph_params)

    return {'dot': str(dot_graph)}
def test_register_model():
    """Register a 'ResNet50' model business object through the model service."""
    input_shapes = [
        IOShape([-1, 3, 224, 224], dtype=float, format=ModelInputFormat.FORMAT_NCHW),
    ]
    output_shapes = [IOShape([-1, 1000], dtype=int)]

    resnet = ModelBO(
        'ResNet50',
        framework=Framework.PYTORCH,
        engine=Engine.TRT,
        version=ModelVersion(1),
        dataset='ImageNet',
        metric={Metric.ACC: 0.80},
        task=Task.IMAGE_CLASSIFICATION,
        inputs=input_shapes,
        outputs=output_shapes,
        weight=Weight(bytes([123])),
    )

    posted = ModelService.post_model(resnet)
    assert posted
async def get_model_structure(id: str):  # noqa
    """
    Return the structure graph of a model (layers as nodes, connections
    between layers as edges).

    Arguments:
        id (str): Model object ID.

    Raises:
        ValueError: If the model's engine is not `Engine.PYTORCH`.
    """
    model_bo = ModelService.get_model_by_id(id)

    engine_supported = model_bo.engine == Engine.PYTORCH
    if not engine_supported:
        raise ValueError(f'model {id} is not supported for editing. '
                         f'Currently only support model with engine=PYTORCH')

    # fetch the weight as a local cache, then load the network from it
    local_weight_path = get_remote_model_weight(model=model_bo)
    network = torch.load(local_weight_path)
    return Structure.from_model(network)
def test_register_model():
    """Register a 'ResNet50' model business object through the model service."""
    input_shapes = [
        IOShape([-1, 3, 224, 224], dtype=float, format=ModelInputFormat.FORMAT_NCHW),
    ]
    output_shapes = [IOShape([-1, 1000], dtype=int)]

    resnet = ModelBO(
        'ResNet50',
        framework=Framework.PYTORCH,
        engine=Engine.TRT,
        version=ModelVersion(1),
        dataset='ImageNet',
        acc=0.8,
        task='image classification',
        inputs=input_shapes,
        outputs=output_shapes,
        weight=Weight(bytes([123])),
    )

    posted = ModelService.post_model(resnet)
    assert posted
def retrieve_model_by_task(task='image classification') -> ModelBO:
    """Look up a model by task.

    The first match is passed to `get_remote_model_weight` before it is returned.

    Arguments:
        task (str): Task name. Default to "image classification"

    Returns:
        ModelBO: The first matching model business object.

    Raises:
        FileNotFoundError: If the query returns no model.
    """
    candidates = ModelService.get_models_by_task(task)

    if len(candidates) == 0:
        raise FileNotFoundError('Model not found!')

    first_match = candidates[0]
    get_remote_model_weight(first_match)
    return first_match
def retrieve_model_by_task(task: Task) -> List[ModelBO]:
    """Look up models by task.

    All matches are passed to `_get_remote_model_weights` before they are returned.

    Arguments:
        task (Task): Task the models are used for.

    Returns:
        List[ModelBO]: A list of model business objects.

    Raises:
        FileNotFoundError: If the query returns no model.
    """
    matches = ModelService.get_models_by_task(task)

    if len(matches) == 0:
        raise FileNotFoundError('Model not found!')

    _get_remote_model_weights(matches)
    return matches
def test_delete_dynamic_profiling_result():
    """Append a second dynamic result, delete the first one, and check the second remains."""
    resnet = ModelService.get_models('ResNet50')[0]

    init_stats = InfoTuple(avg=1, p50=1, p95=1, p99=2)
    other_stats = InfoTuple(avg=1, p50=1, p95=1, p99=1)

    memory = ProfileMemory(1000, 1000, 0.5)
    latency = ProfileLatency(
        init_latency=init_stats,
        preprocess_latency=other_stats,
        inference_latency=other_stats,
        postprocess_latency=other_stats,
    )
    throughput = ProfileThroughput(
        batch_formation_throughput=1,
        preprocess_throughput=1,
        inference_throughput=1,
        postprocess_throughput=1,
    )
    extra_result = DynamicProfileResultBO(
        device_id='gpu:02',
        device_name='Tesla K40c',
        batch=1,
        memory=memory,
        latency=latency,
        throughput=throughput,
    )
    ModelService.append_dynamic_profiling_result(resnet.id, extra_result)

    # reload and pick up the two stored results
    resnet = ModelService.get_models('ResNet50')[0]
    stored_results = resnet.profile_result.dynamic_results
    to_delete = stored_results[0]
    to_keep = stored_results[1]

    # the delete call must report success
    assert ModelService.delete_dynamic_profiling_result(resnet.id, to_delete.ip, to_delete.device_id)

    # exactly one result is left, and it matches the one that was kept
    resnet = ModelService.get_models('ResNet50')[0]
    assert len(resnet.profile_result.dynamic_results) == 1
    remaining = resnet.profile_result.dynamic_results[0]
    assert to_keep.latency.init_latency.avg == remaining.latency.init_latency.avg
def test_delete_dynamic_profiling_result():
    """Append a second dynamic result, delete the first one, and check the second remains."""
    resnet = ModelService.get_models_by_name('ResNet50')[0]

    # only the first latency slot differs from the dummy triple
    dummy = (1, 1, 1)
    extra_result = DynamicProfileResultBO(
        'gpu:02',
        'Tesla K40c',
        1,
        ProfileMemory(1000, 1000, 1000),
        ProfileLatency((1, 1, 2), dummy, dummy, dummy),
        ProfileThroughput(dummy, dummy, dummy, dummy),
    )
    ModelService.append_dynamic_profiling_result(resnet.id, extra_result)

    # reload and pick up the two stored results
    resnet = ModelService.get_models_by_name('ResNet50')[0]
    stored_results = resnet.profile_result.dynamic_results
    to_delete = stored_results[0]
    to_keep = stored_results[1]

    # the delete call must report success
    assert ModelService.delete_dynamic_profiling_result(resnet.id, to_delete.ip, to_delete.device_id)

    # exactly one result is left, and it matches the one that was kept
    resnet = ModelService.get_models_by_name('ResNet50')[0]
    assert len(resnet.profile_result.dynamic_results) == 1
    remaining = resnet.profile_result.dynamic_results[0]
    assert to_keep.latency.init_latency.avg == remaining.latency.init_latency.avg
def register_model(origin_model,
                   dataset: str,
                   metric: Dict[Metric, float],
                   task: Task,
                   inputs: List[IOShape],
                   outputs: List[IOShape],
                   model_input: Optional[List] = None,
                   architecture: str = None,
                   framework: Framework = None,
                   engine: Engine = None,
                   version: ModelVersion = None,
                   parent_model_id: Optional[str] = None,
                   convert: bool = True,
                   profile: bool = True,
                   model_status: List[ModelStatus] = None):
    """Upload a model to ModelDB.

    The given model is stored in the database. Optionally, a family of model
    variants is generated from it (`convert`), and a profiling job is
    submitted for every registered model whose engine is not PyTorch
    (`profile`).

    When `origin_model` is a path and any of `architecture`, `task`,
    `framework`, `engine` or `version` is missing, the path is used as is and
    the meta information is parsed from it at registration time.

    TODO: This function has a super comprehensive logic, need to be simplified.

    Arguments:
        origin_model: The uploaded model without optimization. Either a str
            giving the model file path, or a model object (saved with
            `torch.save` when `framework` is PyTorch and `engine` is
            `Engine.PYTORCH` or `Engine.NONE`).
        dataset (str): Model testing dataset.
        metric (Dict[Metric, float]): Scoring metric and its corresponding
            score used for model evaluation.
        task (Task): Model task type.
        inputs (List[IOShape]): Model input tensors.
        outputs (List[IOShape]): Model output tensors.
        model_input: Sample model input data, forwarded to the model family
            generation.
            TODO: specify more model conversion related params
        architecture (str): Model architecture name. Default to None.
        framework (Framework): Framework name. Default to None.
        engine (Engine): Model optimization engine. Default to None.
        version (ModelVersion): Model version. Default to None.
        parent_model_id (Optional[str]): The parent model id of the current
            model if it is derived from a pre-existing one.
        convert (bool): Flag for generation of the model family. Default to `True`.
        profile (bool): Flag for profiling uploaded (including converted)
            models. Default to `True`.
        model_status (List[ModelStatus]): Status of the model in its
            lifecycle. When None, `[ModelStatus.PUBLISHED]` is used for
            PyTorch-engine models and `[ModelStatus.CONVERTED]` otherwise.

    Raises:
        AssertionError: If `origin_model` is a path that does not exist.
        ValueError: If profiling is requested for an engine without a client.
    """
    from modelci.controller import job_executor
    from modelci.controller.executor import Job

    model_dir_list = list()

    # type and existence check
    if isinstance(origin_model, str):
        model_dir = Path(origin_model).absolute()
        assert model_dir.exists(), f'model weight does not exist at {origin_model}'

        if all([architecture, task, framework, engine, version]):
            # from explicit architecture, framework, engine and version
            ext = model_dir.suffix
            path = generate_path(architecture, task, framework, engine, version).with_suffix(ext)
            # if already in the destination folder
            if path == model_dir:
                pass
            # create destination folder
            else:
                if ext:
                    path.parent.mkdir(parents=True, exist_ok=True)
                else:
                    path.mkdir(parents=True, exist_ok=True)

                # copy to cached folder
                subprocess.call(['cp', model_dir, path])
        else:
            # from implicit extracted from path, check validity of the path later at registration
            path = model_dir
        model_dir_list.append(path)
    elif framework == Framework.PYTORCH and engine in [Engine.PYTORCH, Engine.NONE]:
        # save original pytorch model
        pytorch_dir = generate_path(
            task=task,
            model_name=architecture,
            framework=framework,
            engine=engine,
            version=str(version),
        )
        pytorch_dir.parent.mkdir(parents=True, exist_ok=True)
        save_path_with_ext = pytorch_dir.with_suffix('.pth')
        torch.save(origin_model, str(save_path_with_ext))
        model_dir_list.append(pytorch_dir.with_suffix('.pth'))

    if convert:
        # TODO: generate from path name
        # generate model variant
        model_dir_list.extend(
            _generate_model_family(origin_model,
                                   architecture,
                                   task,
                                   framework,
                                   filename=str(version),
                                   inputs=inputs,
                                   outputs=outputs,
                                   model_input=model_input))

    # Both statuses are replaced by PROFILING once a profiling job is queued.
    statuses_replaced_by_profiling = (ModelStatus.CONVERTED, ModelStatus.PUBLISHED)

    # register
    for model_dir in model_dir_list:
        # meta information is always re-derived from the stored path
        parse_result = parse_path(model_dir)
        architecture = parse_result['architecture']
        task = parse_result['task']
        framework = parse_result['framework']
        engine = parse_result['engine']
        version = parse_result['version']
        filename = parse_result['filename']

        if model_status is not None:
            model_bo_status = model_status
        elif engine == Engine.PYTORCH:
            model_bo_status = [ModelStatus.PUBLISHED]
        else:
            model_bo_status = [ModelStatus.CONVERTED]

        with open(str(model_dir), 'rb') as f:
            model = ModelBO(name=architecture,
                            task=task,
                            framework=framework,
                            engine=engine,
                            version=version,
                            dataset=dataset,
                            metric=metric,
                            parent_model_id=parent_model_id,
                            inputs=inputs,
                            outputs=outputs,
                            model_status=model_bo_status,
                            weight=Weight(f, filename=filename))

            # posted while the weight file is still open
            ModelService.post_model(model)
        # TODO refresh
        model = ModelService.get_models(name=architecture,
                                        task=task,
                                        framework=framework,
                                        engine=engine,
                                        version=version)[0]
        # models registered later in this loop get this one as their parent
        if model.engine == Engine.PYTORCH or model.engine == Engine.TFS:
            parent_model_id = model.id

        # profile registered model
        if profile and engine != Engine.PYTORCH:
            file = tf.keras.utils.get_file(
                "grace_hopper.jpg",
                "https://storage.googleapis.com/download.tensorflow.org/example_images/grace_hopper.jpg"
            )
            test_img_bytes = cv2.imread(file)

            kwargs = {
                'repeat_data': test_img_bytes,
                'batch_size': 32,
                'batch_num': 100,
                'asynchronous': False,
                'model_info': model,
            }

            # Drop both CONVERTED and PUBLISHED. The previous
            # `item is not (CONVERTED or PUBLISHED)` only compared against
            # CONVERTED, because `or` evaluates to its first truthy operand.
            new_status = [
                item for item in model.model_status
                if item not in statuses_replaced_by_profiling
            ]
            new_status.append(ModelStatus.PROFILING)
            model.model_status = new_status
            ModelService.update_model(model)

            if engine == Engine.TORCHSCRIPT:
                client = CVTorchClient(**kwargs)
            elif engine == Engine.TFS:
                client = CVTFSClient(**kwargs)
            elif engine == Engine.ONNX:
                client = CVONNXClient(**kwargs)
            elif engine == Engine.TRT:
                client = CVTRTClient(**kwargs)
            else:
                raise ValueError(f'No such serving engine: {engine}')

            job_cuda = Job(client=client, device='cuda:0', model_info=model)
            # job_cpu = Job(client=client, device='cpu', model_info=model)
            job_executor.submit(job_cuda)
def test_register_static_profiling_result():
    """Register a static profiling result on the first 'ResNet50' model."""
    resnet = ModelService.get_models('ResNet50')[0]
    static_result = StaticProfileResultBO(5000, 200000, 200000, 10000, 10000, 10000)

    registered = ModelService.register_static_profiling_result(resnet.id, static_result)
    assert registered
def test_get_model_by_id():
    """Fetch a model by the ID of the first 'ResNet50' model and compare IDs."""
    expected = ModelService.get_models('ResNet50')[0]
    fetched = ModelService.get_model_by_id(expected.id)

    # the fetched model must carry the ID it was queried with
    assert fetched.id == expected.id
def test_delete_model():
    """Delete the first 'ResNet50' model by its ID."""
    resnet = ModelService.get_models('ResNet50')[0]

    deleted = ModelService.delete_model_by_id(resnet.id)
    assert deleted