def terminate(self):
    if self._task:
        # trigger PyTorch Lightning training graceful shutdown via a ^C (KeyboardInterrupt)
        self._task.set_exception(KeyboardInterrupt())
        model_train_curd.update(TrainingJobUpdate(_id=self._id, status=Status.FAIL))
        model_bo = ModelService.get_model_by_id(self.model_id)
        model_bo.model_status.remove(ModelStatus.TRAINING)
        model_bo.model_status.append(ModelStatus.DRAFT)
        ModelService.update_model(model_bo)
def start(self):
    def training_done_callback(future):
        model_train_curd.update(TrainingJobUpdate(_id=self._id, status=Status.PASS))
        # TODO: save to database and update model_status, engine
        print(self.export_model())

    self._task = self._executor.submit(self.trainer_engine.fit, self.model, **self._data_loader_kwargs)
    self._task.add_done_callback(training_done_callback)
    model_train_curd.update(TrainingJobUpdate(_id=self._id, status=Status.RUNNING))
    model_bo = ModelService.get_model_by_id(self.model_id)
    model_bo.model_status.remove(ModelStatus.DRAFT)
    model_bo.model_status.append(ModelStatus.TRAINING)
    ModelService.update_model(model_bo)
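# Usage sketch (illustrative, not part of the original source): `start` submits the
# Lightning `fit` call to a background executor, and `terminate` cancels it by injecting
# a KeyboardInterrupt into the pending future. `trainer` is assumed to be an already
# constructed PyTorchTrainer instance (e.g. built by `from_training_job` below).
trainer.start()        # submit trainer_engine.fit(...) and mark the training job RUNNING
# ... training runs in the background executor ...
trainer.terminate()    # inject KeyboardInterrupt and mark the training job FAIL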
@classmethod
def from_training_job(cls, training_job: TrainingJob) -> 'PyTorchTrainer':
    # TODO: only fine-tuning is supported
    model_bo = ModelService.get_model_by_id(training_job.model)
    if model_bo.engine != Engine.PYTORCH:
        raise ValueError(
            f'Model engine expected `{Engine.PYTORCH}`, but got {model_bo.engine}.'
        )

    # download model weight to local cache
    cache_path = get_remote_model_weight(model_bo)
    net = torch.load(cache_path)
    freeze(module=net, n=-1, train_bn=True)

    # build PyTorch Lightning module
    fine_tune_module_kwargs = {
        'net': net,
        'loss': eval(str(training_job.loss_function))(),  # nosec
        'batch_size': training_job.data_module.batch_size,
        'num_workers': training_job.data_module.num_workers,
    }
    if training_job.optimizer_property.lr:
        fine_tune_module_kwargs['lr'] = training_job.optimizer_property.lr
    if training_job.lr_scheduler_property.gamma:
        fine_tune_module_kwargs['lr_scheduler_gamma'] = training_job.lr_scheduler_property.gamma
    if training_job.lr_scheduler_property.step_size:
        fine_tune_module_kwargs['step_size'] = training_job.lr_scheduler_property.step_size
    model = FineTuneModule(**fine_tune_module_kwargs)
    data_module = PyTorchDataModule(**training_job.data_module.dict(exclude_none=True))
    trainer_kwargs = training_job.dict(exclude_none=True, include={'min_epochs', 'max_epochs'})
    trainer = cls(
        id=training_job.id,
        model=model,
        data_loader_kwargs={'datamodule': data_module},
        trainer_kwargs={
            'default_root_dir': training_job.data_module.data_dir or OUTPUT_DIR,
            'weights_summary': None,
            'progress_bar_refresh_rate': 1,
            'num_sanity_val_steps': 0,
            'gpus': 1,  # TODO: set GPU number
            **trainer_kwargs,
        })
    return trainer
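# Usage sketch (illustrative, not part of the original source). Only the attribute names
# (`model`, `loss_function`, `data_module`, `optimizer_property`, `lr_scheduler_property`,
# `min_epochs`, `max_epochs`) are taken from the factory above; the field values and the
# keyword-argument construction style are assumptions. `model_id` is a placeholder.
job = TrainingJob(
    model=model_id,  # ID of a registered PyTorch model
    loss_function='torch.nn.CrossEntropyLoss',
    data_module={'data_dir': './data', 'batch_size': 32, 'num_workers': 4},
    optimizer_property={'lr': 1e-3},
    lr_scheduler_property={'gamma': 0.1, 'step_size': 7},
    min_epochs=1,
    max_epochs=15,
)
trainer = PyTorchTrainer.from_training_job(job)
trainer.start()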
def generate_model_graph(*, id: str):  # noqa
    model_bo = ModelService.get_model_by_id(id)
    dot_graph = ''
    if model_bo.engine == Engine.PYTORCH:
        pytorch_model = torch.load(model_bo.saved_path)
        sample_data = torch.zeros(1, *model_bo.inputs[0].shape[1:], dtype=torch.float, requires_grad=False)
        out = pytorch_model(sample_data)
        dot_graph = make_dot(out, params=dict(list(pytorch_model.named_parameters()) + [('x', sample_data)]))

    return {'dot': str(dot_graph)}
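# Client-side sketch (illustrative, not part of the original source): `make_dot` returns a
# `graphviz.Digraph`, so the DOT string in the response can be rendered with the standard
# `graphviz` package, assuming the Graphviz binaries are installed. `model_id` is a placeholder.
import graphviz

payload = generate_model_graph(id=model_id)
graphviz.Source(payload['dot']).render('model_graph', format='svg', cleanup=True)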
async def get_model_structure(id: str):  # noqa
    """
    Get the model structure as a graph (connections between layers as edges, layers as nodes).

    Arguments:
        id (str): Model object ID.
    """
    # return model DAG
    model = ModelService.get_model_by_id(id)
    if model.engine != Engine.PYTORCH:
        raise ValueError(f'Model {id} is not supported for editing. '
                         f'Currently only models with engine=PYTORCH are supported.')

    # download model weight to local cache
    cache_path = get_remote_model_weight(model=model)
    net = torch.load(cache_path)
    return Structure.from_model(net)
def test_get_model_by_id():
    model_bo = ModelService.get_models('ResNet50')[0]
    model = ModelService.get_model_by_id(model_bo.id)
    # check model id
    assert model.id == model_bo.id
def update_finetune_model_as_new(id: str, updated_layer: Structure, dry_run: bool = False):  # noqa
    """
    Temporary function for fine-tuning CV models. Its functionality overlaps with
    `update_model_structure_as_new`; please use `update_model_structure_as_new` in the next release.

    Examples:
        Fine-tune the model by modifying the layer named 'fc' (the last layer). The layer
        has a changed argument out_features = 10. op_='M' indicates that the operation on
        this layer ('fc') is 'Modify'. There are no changes to the layer connections.
        Therefore, the structure change summary is
            [M] fc: (...) out_features=10

        >>> from collections import OrderedDict
        >>> structure_data = {
        ...     'layer': OrderedDict({'fc': {'out_features': 10, 'op_': 'M', 'type_': 'torch.nn.Linear'}})
        ... }
        >>> update_finetune_model_as_new(id=..., updated_layer=Structure.parse_obj(structure_data))

    Args:
        id (str): ID of the model to be updated.
        updated_layer (Structure): Contains the layers to be fine-tuned.
        dry_run (bool): Test run to verify whether the provided parameters (i.e. the model
            specified by `id` and the updated layers) are valid.

    Returns:
        A dict containing the ID of the newly registered draft model; ``True`` if
        `updated_layer` contains no layers; ``None`` for a dry run.
    """
    if len(updated_layer.layer.items()) == 0:
        return True

    model = ModelService.get_model_by_id(id)
    if model.engine != Engine.PYTORCH:
        raise ValueError(f'Model {id} is not supported for editing. '
                         f'Currently only models with engine=PYTORCH are supported.')

    # download model weight to local cache
    cache_path = get_remote_model_weight(model=model)
    net = torch.load(cache_path)

    for layer_name, layer_param in updated_layer.layer.items():
        layer_op = getattr(layer_param, 'op_')

        # update layer
        if layer_op == Operation.MODIFY:
            # check if the layer name exists
            # TODO: check if a nested layer path exists, e.g. "layer1.0.conv1"
            if not hasattr(net, layer_name):
                raise ModelStructureError(
                    f'Structure layer name `{layer_name}` not found in model {id}.'
                )
            net_layer = getattr(net, layer_name)

            # check if the provided type matches the original type
            layer_type = type(net_layer)
            layer_type_provided = eval(layer_param.type_.value)  # nosec
            if layer_type is not layer_type_provided:
                raise ModelStructureError(
                    f'Expect `{layer_name}.type_` to be {layer_type}, '
                    f'but got {layer_type_provided}')

            # get the current layer parameters
            layer_param_old = layer_param.parse_layer_obj(net_layer)
            layer_param_data = layer_param_old.dict(exclude_none=True, exclude={'type_', 'op_'})

            layer_param_update_data = layer_param.dict(exclude_none=True, exclude={'type_', 'op_'})
            # replace 'null' with None. See reason :class:`ModelLayer`.
            for k, v in layer_param_update_data.items():
                if v == 'null':
                    layer_param_update_data[k] = None

            # update the layer parameters and rebuild the layer
            layer_param_data.update(layer_param_update_data)
            layer = layer_type(**layer_param_data)
            setattr(net, layer_name, layer)
        else:
            # if layer_op is Operation.ADD:
            #     1. check that the layer name does not exist
            #     2. add the layer
            #     3. change the `forward` function according to the connections
            # if layer_op is Operation.DELETE:
            #     1. check that the layer exists
            #     2. delete the layer
            #     3. change the `forward` function
            raise ValueError(
                'Operation not permitted. Please use `update_model_structure_as_new`.'
            )

    input_tensors = list()
    bs = 1
    for input_ in model.inputs:
        input_tensor = torch.rand(bs, *input_.shape[1:]).type(model_data_type_to_torch(input_.dtype))
        input_tensors.append(input_tensor)

    # parse output tensors
    output_shapes = list()
    output_tensors = net(*input_tensors)
    if not isinstance(output_tensors, (list, tuple)):
        output_tensors = (output_tensors,)
    for output_tensor in output_tensors:
        output_shape = IOShape(shape=[bs, *output_tensor.shape[1:]], dtype=type_to_data_type(output_tensor.dtype))
        output_shapes.append(output_shape)

    if not dry_run:
        # TODO: return validation result for dry_run mode
        # TODO: apply Semantic Versioning https://semver.org/
        # TODO: resolve the duplicate model version problem in a more efficient way
        version = ModelVersion(model.version.ver + 1)
        previous_models = ModelService.get_models(
            architecture=model.architecture,
            task=model.task,
            framework=model.framework,
            engine=Engine.NONE)
        if len(previous_models):
            last_version = max(previous_models, key=lambda k: k.version.ver).version.ver
            version = ModelVersion(last_version + 1)

        saved_path = generate_path_plain(
            architecture=model.architecture,
            task=model.task,
            framework=model.framework,
            engine=Engine.NONE,
            version=version)
        saved_path.parent.mkdir(parents=True, exist_ok=True)
        # save the modified network (not the ModelBO) as the new draft model's weight
        torch.save(net, saved_path.with_suffix('.pt'))
        mlmodelin = MLModel(
            dataset='',
            metric={key: 0 for key in model.metric.keys()},
            task=model.task,
            inputs=model.inputs,
            outputs=output_shapes,
            architecture=model.name,
            framework=model.framework,
            engine=Engine.NONE,
            model_status=[ModelStatus.DRAFT],
            parent_model_id=model.id,
            version=version,
            weight=saved_path)
        register_model(mlmodelin, convert=False, profile=False)

        model_bo = ModelService.get_models(
            architecture=model.architecture,
            task=model.task,
            framework=model.framework,
            engine=Engine.NONE,
            version=version)[0]

        return {'id': model_bo.id}
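# Usage sketch (illustrative, not part of the original source), mirroring the docstring
# example above; `model_id` is a placeholder. A dry run only validates the requested
# change, while the real run registers the modified network as a new draft model.
from collections import OrderedDict

structure_data = {
    'layer': OrderedDict({'fc': {'out_features': 10, 'op_': 'M', 'type_': 'torch.nn.Linear'}})
}
updated = Structure.parse_obj(structure_data)

update_finetune_model_as_new(id=model_id, updated_layer=updated, dry_run=True)   # validate only
result = update_finetune_model_as_new(id=model_id, updated_layer=updated)        # register draft
print(result['id'])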
def get_model(*, id: str):  # noqa
    model = ModelService.get_model_by_id(id)
    return ModelDetailOut.from_bo(model)