def work(self):
    """Create a ``Model`` record and, when a training task is set, export
    its traced model and weights into the project's model folder.

    Without ``self.train_task`` only the database record is added. With
    it, the task's score is stored on the record, the checkpoint found in
    the task's ``log`` directory is traced and saved as ``<name>.pth``,
    and ``checkpoints/best_full.pth`` is copied as ``<name>_weight.pth``.
    """
    project = ProjectProvider(self.session).by_id(self.project)

    self.info(f'Task = {self.train_task} child_task: {self.child_task}')

    model = Model(
        created=now(),
        name=self.name,
        project=self.project,
        equations='',
        fold=self.fold,
    )
    provider = ModelProvider(self.session)

    # Nothing to export: just register the record.
    if not self.train_task:
        provider.add(model)
        return

    task = TaskProvider(self.session).by_id(self.train_task)
    model.score_local = task.score

    # A child task's folder takes precedence over the parent task's.
    task_dir = join(TASK_FOLDER, str(self.child_task or task.id))
    src_log = f'{task_dir}/log'
    models_dir = join(MODEL_FOLDER, project.name)
    os.makedirs(models_dir, exist_ok=True)

    traced_tmp_path = f'{src_log}/traced.pth'
    traced = trace_model_from_checkpoint(src_log, self, file=self.file)

    traced_dst_path = f'{models_dir}/{model.name}.pth'
    weights_dst_path = f'{models_dir}/{model.name}_weight.pth'

    # The traced model is written next to the logs first, then copied.
    torch.jit.save(traced, traced_tmp_path)
    shutil.copy(traced_tmp_path, traced_dst_path)

    # The weights always come from 'best_full'; self.file is overwritten
    # with that name as well.
    checkpoint_name = 'best_full'
    self.file = checkpoint_name
    shutil.copy(
        f'{src_log}/checkpoints/{checkpoint_name}.pth', weights_dst_path
    )

    provider.add(model)
def work(self):
    """Create a ``Model`` record and, when a training task is set, export
    its traced model and weights into the project's model folder.

    The log directory is not hard-coded: it is read from the ``logdir``
    argument of the Catalyst config file referenced by the task's executor
    entry in the DAG config.
    """
    project = ProjectProvider(self.session).by_id(self.project)

    self.info(f'Task = {self.train_task} child_task: {self.child_task}')

    model = Model(
        created=now(),
        name=self.name,
        project=self.project,
        equations='',
        fold=self.fold,
    )
    provider = ModelProvider(self.session)

    # Nothing to export: just register the record.
    if not self.train_task:
        provider.add(model)
        return

    task_provider = TaskProvider(self.session)
    dag_provider = DagProvider(self.session)
    task = task_provider.by_id(self.train_task)
    dag = dag_provider.by_id(task.dag)

    # A child task's folder takes precedence over the parent task's.
    task_dir = join(TASK_FOLDER, str(self.child_task or task.id))

    # Resolve the log directory:
    # DAG config -> executor entry -> Catalyst config file -> args.logdir
    dag_config = yaml_load(dag.config)
    executor_spec = dag_config['executors'][task.executor]
    catalyst_config_path = join(task_dir, executor_spec['args']['config'])
    catalyst_config = yaml_load(file=catalyst_config_path)
    catalyst_logdir = catalyst_config['args']['logdir']

    model.score_local = task.score

    src_log = f'{task_dir}/{catalyst_logdir}'
    models_dir = join(MODEL_FOLDER, project.name)
    os.makedirs(models_dir, exist_ok=True)

    traced_tmp_path = f'{src_log}/traced.pth'
    traced = trace_model_from_checkpoint(src_log, self, file=self.file)

    traced_dst_path = f'{models_dir}/{model.name}.pth'
    weights_dst_path = f'{models_dir}/{model.name}_weight.pth'

    # The traced model is written next to the logs first, then copied.
    torch.jit.save(traced, traced_tmp_path)
    shutil.copy(traced_tmp_path, traced_dst_path)

    # The weights always come from 'best_full'; self.file is overwritten
    # with that name as well.
    checkpoint_name = 'best_full'
    self.file = checkpoint_name
    shutil.copy(
        f'{src_log}/checkpoints/{checkpoint_name}.pth', weights_dst_path
    )

    provider.add(model)
def work(self):
    """Trace the trained checkpoint and register it as a ``Model``.

    The model row is added without committing, the traced model and the
    ``best.pth`` weights are copied into the project's model folder, and
    the row's interface params are updated with the relative path of the
    traced file before ``provider.update()`` is called.

    Raises:
        Exception: any error raised while saving/copying files or updating
            the record is re-raised after ``provider.rollback()``.
    """
    task_provider = TaskProvider(self.session)
    task = task_provider.by_id(self.train_task)
    dag = DagProvider(self.session).by_id(
        self.dag_pipe, joined_load=[Dag.project_rel]
    )

    # A child task's folder takes precedence over the parent task's.
    task_dir = join(TASK_FOLDER, str(self.child_task or task.id))
    src_log = f'{task_dir}/log'
    models_dir = join(MODEL_FOLDER, dag.project_rel.name)
    os.makedirs(models_dir, exist_ok=True)

    # NOTE(review): this logs self.task while the lookup above uses
    # self.train_task - confirm that is intended.
    self.info(f'Task = {self.task} child_task: {self.child_task}')
    model_path_tmp = f'{src_log}/traced.pth'
    traced = trace_model_from_checkpoint(src_log, self)

    model = Model(
        dag=self.dag_pipe,
        interface=self.interface,
        slot=self.slot,
        score_local=task.score,
        created=now(),
        name=self.name,
        project=dag.project,
        interface_params=yaml_dump(self.interface_params),
    )
    provider = ModelProvider(self.session)
    # Added without commit so that a failure below can be rolled back.
    provider.add(model, commit=False)
    try:
        model_path = f'{models_dir}/{model.name}.pth'
        model_weight_path = f'{models_dir}/{model.name}_weight.pth'
        torch.jit.save(traced, model_path_tmp)
        shutil.copy(model_path_tmp, model_path)
        shutil.copy(f'{src_log}/checkpoints/best.pth', model_weight_path)

        # Record where the traced file lives, relative to the models root.
        interface_params = yaml_load(model.interface_params)
        interface_params['file'] = join('models', model.name + '.pth')
        model.interface_params = yaml_dump(interface_params)
        provider.update()
    except Exception:
        provider.rollback()
        # Bare raise re-raises the active exception with its traceback
        # untouched.
        raise
def dag_model_add(session: Session, data: dict):
    """Add a model directly, or start a ``model_add`` DAG for a trained task.

    Args:
        session: database session passed to the providers
        data: request payload. ``name`` and ``project`` are always read;
            ``equations`` is read when no task is given; ``file`` and
            ``fold`` are read when ``task`` is set.

    When ``data`` has no truthy ``task`` a bare ``Model`` row is added and
    the function returns. Otherwise a one-executor DAG config is built and
    handed to ``dag_standard``; if the task has children, the first child's
    id and assigned computer are used instead of the parent's computer.
    """
    if not data.get('task'):
        record = Model(
            name=data['name'],
            project=data['project'],
            equations=data['equations'],
            created=now(),
        )
        ModelProvider(session).add(record)
        return

    task_provider = TaskProvider(session)
    task = task_provider.by_id(
        data['task'], options=joinedload(Task.dag_rel, innerjoin=True)
    )
    children = task_provider.children(task.id)

    # Default to the parent task's computer; the first child overrides it.
    computer = task.computer_assigned
    child_task = None
    if len(children) > 0:
        first_child = children[0]
        child_task = first_child.id
        computer = first_child.computer_assigned

    project = ProjectProvider(session).by_id(task.dag_rel.project)

    info_section = {
        'name': 'model_add',
        'project': project.name,
        'computer': computer,
    }
    executor_spec = {
        'type': 'model_add',
        'project': data['project'],
        'task': data.get('task'),
        'name': data['name'],
        'file': data['file'],
        'child_task': child_task,
        'fold': data['fold'],
    }
    config = {
        'info': info_section,
        'executors': {'model_add': executor_spec},
    }

    dag_standard(
        session=session, config=config, debug=False, upload_files=False
    )
def trace_model(
    model: Model,
    runner: Runner,
    batch=None,
    method_name: str = "forward",
    mode: str = "eval",
    requires_grad: bool = False,
    opt_level: str = None,
    device: str = "cpu",
    predict_params: dict = None,
) -> ScriptModule:
    """
    Traces model using runner and batch

    The runner's ``model`` and ``device`` attributes are temporarily
    replaced by the tracing wrapper and ``device``; they are restored
    before returning, including when ``runner.predict_batch`` raises.

    Args:
        model: Model to trace
        runner: Model's native runner that was used to train model
        batch: Batch to trace the model
        method_name (str): Model's method name that will be
            used as entrypoint during tracing
        mode (str): Mode for model to trace (``train`` or ``eval``)
        requires_grad (bool): Flag to use grads
        opt_level (str): Apex FP16 init level, optional
        device (str): Torch device
        predict_params (dict): additional parameters for model forward

    Returns:
        (ScriptModule): Traced model

    Raises:
        ValueError: if ``batch`` or ``runner`` is ``None``, or ``mode`` is
            neither ``train`` nor ``eval``
    """
    if batch is None or runner is None:
        raise ValueError("Both batch and runner must be specified.")

    if mode not in ["train", "eval"]:
        raise ValueError(f"Unknown mode '{mode}'. Must be 'eval' or 'train'")

    predict_params = predict_params or {}

    tracer = _TracingModelWrapper(model, method_name)
    if opt_level is not None:
        utils.assert_fp16_available()
        # If traced in AMP we need to initialize the model before calling
        # the jit
        # https://github.com/NVIDIA/apex/issues/303#issuecomment-493142950
        from apex import amp

        model = model.to(device)
        model = amp.initialize(model, optimizers=None, opt_level=opt_level)
        # after fixing this bug https://github.com/pytorch/pytorch/issues/23993
        params = {**predict_params, "check_trace": False}
    else:
        params = predict_params

    # Switch the model into the requested mode (model.train() / model.eval())
    getattr(model, mode)()
    utils.set_requires_grad(model, requires_grad=requires_grad)

    _runner_model, _runner_device = runner.model, runner.device

    runner.model, runner.device = tracer, device
    try:
        runner.predict_batch(batch, **params)
        result: ScriptModule = tracer.tracing_result
    finally:
        # Always hand the runner back in its original state, even when
        # the forward pass fails.
        runner.model, runner.device = _runner_model, _runner_device

    return result