def _train_single_model(self, model_desc=None, model_id=None, weights_file=None):
    """Create a trainer for one model and run a fully train."""
    cls_trainer = ClassFactory.get_cls(ClassType.TRAINER, PipeStepConfig.trainer.type)
    step_name = self.task.step_name
    if model_desc is not None:
        sample = dict(worker_id=model_id, desc=model_desc, step_name=step_name)
        record = ReportRecord().load_dict(sample)
        logging.debug("update record=%s", str(record))
        trainer = cls_trainer(model_desc=model_desc, id=model_id,
                              pretrained_model_file=weights_file)
    else:
        trainer = cls_trainer(None, 0)
        record = ReportRecord(trainer.step_name, trainer.worker_id, desc=trainer.model_desc)
    ReportClient().update(**record.to_dict())
    # resume training
    if vega.is_torch_backend() and General._resume:
        trainer.load_checkpoint = True
        trainer._resume_training = True
    if self._distributed_training:
        self._do_distributed_fully_train(trainer)
    else:
        self._do_single_fully_train(trainer)
def _evaluate_single_model(self, record):
    """Dispatch host and/or device evaluators for one record."""
    worker_info = {"step_name": record.step_name, "worker_id": record.worker_id}
    try:
        _record = dict(worker_id=record.worker_id, desc=record.desc, step_name=record.step_name)
        ReportClient().update(**_record)
        if EvaluatorConfig.host_evaluator_enable:
            cls_evaluator = ClassFactory.get_cls(ClassType.HOST_EVALUATOR, "HostEvaluator")
            evaluator = cls_evaluator(worker_info=worker_info, model_desc=record.desc,
                                      weights_file=record.weights_file)
            self.master.run(evaluator)
        if EvaluatorConfig.device_evaluator_enable:
            cls_evaluator = ClassFactory.get_cls(ClassType.DEVICE_EVALUATOR, "DeviceEvaluator")
            evaluator = cls_evaluator(worker_info=worker_info, model_desc=record.desc,
                                      weights_file=record.weights_file)
            self.master.run(evaluator)
    except Exception:
        logger.error("Failed to evaluate model, worker info={}".format(worker_info))
        logger.error(traceback.format_exc())
        return
def _update(self, step_name, worker_id):
    # Wait for the report thread to update all records.
    ReportClient().set_finished(step_name, worker_id)
    if not self.update_func:
        return
    # Dispatch on the callback's signature: a positional (step_name, worker_id)
    # style callback vs. one that takes a single info dict.
    if self.update_func.__code__.co_varnames.index("step_name") == 1:
        self.update_func(step_name, worker_id)
    else:
        self.update_func({"step_name": step_name, "worker_id": worker_id})
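# The signature check in _update above relies on ``__code__.co_varnames``, which
# lists a function's parameter names first; ``index("step_name") == 1`` therefore
# matches a bound method whose second parameter is ``step_name``. A minimal sketch
# of the two callback shapes this dispatch accepts (the names ``MyUpdater`` and
# ``my_update`` are hypothetical, for illustration only):
class MyUpdater:
    def update(self, step_name, worker_id):
        # method style: invoked as update_func(step_name, worker_id)
        print(step_name, worker_id)


def my_update(info):
    # dict style: invoked as update_func({"step_name": ..., "worker_id": ...})
    print(info["step_name"], info["worker_id"])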
def update(self, step_name, worker_id):
    """Update the search algorithm according to the worker path.

    :param step_name: step name
    :param worker_id: current worker id
    :return:
    """
    record = ReportClient().get_record(step_name, worker_id)
    logging.debug("Get Record=%s", str(record))
    self.search_alg.update(record.serialize())
    try:
        self.dump()
    except TypeError:
        logging.warning("The Generator contains an object which can't be pickled.")
    logging.info(f"Update Success. step_name={step_name}, worker_id={worker_id}")
    logging.info("Best values: %s", ReportServer().print_best(step_name=General.step_name))
def _train_single_model(self, model_desc, model_id, hps, multi_task):
    """Create a trainer for one model and run a fully train."""
    cls_trainer = ClassFactory.get_cls(ClassType.TRAINER, PipeStepConfig.trainer.type)
    step_name = self.task.step_name
    sample = dict(worker_id=model_id, desc=model_desc, step_name=step_name)
    record = ReportRecord().load_dict(sample)
    logging.debug("update record=%s", str(record))
    trainer = cls_trainer(model_desc=model_desc, id=model_id, hps=hps, multi_task=multi_task)
    ReportClient().update(**record.to_dict())
    if self._distributed_training:
        self._do_distributed_fully_train(trainer)
    else:
        self._do_single_fully_train(trainer)
def sample(self):
    """Sample a worker id and model description from the search algorithm."""
    out = []
    for _ in range(10):
        res = self.search_alg.search()
        if not res:
            return None
        if not isinstance(res, list):
            res = [res]
        if len(res) == 0:
            return None
        out = []
        for sample in res:
            if isinstance(sample, dict):
                id = sample["worker_id"]
                desc = self._decode_hps(sample["encoded_desc"])
                sample.pop("worker_id")
                sample.pop("encoded_desc")
                kwargs = sample
                sample = _split_sample((id, desc))
            else:
                kwargs = {}
                sample = _split_sample(sample)
            if hasattr(self, "objective_keys") and self.objective_keys:
                kwargs["objective_keys"] = self.objective_keys
            (id, desc, hps) = sample
            if "modules" in desc:
                PipeStepConfig.model.model_desc = deepcopy(desc)
            elif "network" in desc:
                origin_desc = PipeStepConfig.model.model_desc
                model_desc = update_dict(desc["network"], origin_desc)
                PipeStepConfig.model.model_desc = model_desc
                desc.pop('network')
                desc.update(model_desc)
            if self.quota.is_filtered(desc):
                continue
            if self.affinity and not self.affinity.is_affinity(desc):
                continue
            ReportClient().update(General.step_name, id, desc=desc, hps=hps, **kwargs)
            out.append((id, desc, hps))
        if out:
            break
    return out
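# ``_split_sample`` above is used but not defined in this section; it normalizes a
# search-algorithm sample into an (id, desc, hps) triple. A minimal sketch of a
# compatible helper, assuming two- or three-element tuples as seen at the call
# sites (this is an assumption, not the actual implementation):
def _split_sample(sample):
    """Pad a (worker_id, desc[, hps]) tuple to a full (id, desc, hps) triple."""
    if len(sample) == 3:
        return sample[0], sample[1], sample[2]
    return sample[0], sample[1], None  # hps defaults to None when absent (assumed)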
def train_process(self):
    """Run the validate process for the device evaluate worker."""
    init_log(level=General.logger.level,
             log_file=f"{self.step_name}_device_evaluator_{self.worker_id}.log",
             log_path=self.local_log_path)
    logging.info("start Davinci or mobile evaluate process")
    self.load_model()
    self.valid_loader = self._init_dataloader(mode='test')
    performance = self.valid()
    ReportClient().update(self.step_name, self.worker_id, performance=performance)
    logging.info(f"finished device evaluation, id: {self.worker_id}, performance: {performance}")
def after_train(self, logs=None):
    """Be called after training."""
    self.trainer._backup()
    self.wrp_trainer.notify('after_train', {'logs': logs})
    if self.estim_th:
        self.estim_th.join()
    ret = self.estim_ret.get('final')
    self.trainer.performance = {'default': ret.get('best_score')}
    desc = self.trainer.model_desc.copy()
    desc['custom']['arch_desc'] = ret.get('best_arch_desc')
    # force update trainer record
    ReportClient().update(self.trainer.step_name, self.trainer.worker_id, desc=desc)
def epoch_end(self, run_context):
    """Be called after each epoch."""
    cb_params = run_context.original_args()
    metric = self.model.eval(self.eval_dataset, dataset_sink_mode=self.dataset_sink_mode)
    logging.info("Current epoch: [{}/{}], current valid metric: {}.".format(
        cb_params.cur_epoch_num, cb_params.epoch_num, metric))
    self.trainer.performance.update(metric)
    # In distributed training, only device 0 reports to the report server.
    if self.trainer.distributed and os.environ["DEVICE_ID"] != "0":
        return
    ReportClient().update(self.trainer.step_name, self.trainer.worker_id,
                          num_epochs=cb_params.epoch_num,
                          current_epoch=cb_params.cur_epoch_num,
                          performance=self.trainer.performance)
def do(self):
    """Start to run benchmark evaluator."""
    logger.info("BenchmarkPipeStep started...")
    records = self._get_current_step_records()
    if not records:
        logger.error("There is no model to evaluate.")
        return
    self.update_status(Status.running)
    self.master = create_master()
    for record in records:
        ReportClient().update(record.step_name, record.worker_id, desc=record.desc)
        self._evaluate_single_model(record)
    self.master.join()
    ReportServer().output_step_all_records(step_name=General.step_name)
    self.master.close()
    ReportServer().backup_output_path()
    self.update_status(Status.finished)
def _save_best(self, desc):
    """Report the given description as the best record."""
    ReportClient().update(self.step_name, self.sample_count + 1,
                          performance={"accuracy": 100}, desc=desc)
def _update_report(self, epoch, performance):
    """Push the latest performance of this worker to the report server."""
    record = ReportClient().update(self.trainer.step_name, self.trainer.worker_id,
                                   performance=performance)
    logging.debug("report_callback record: {}".format(record))
def _init_report(self):
    """Register this worker's initial record with the report server."""
    record = ReportClient().update(worker_id=self.trainer.worker_id,
                                   desc=self.cfg.model_desc,
                                   step_name=self.trainer.step_name,
                                   weights_file=self.best_model_file)
    logging.debug("update record=%s", str(record))
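# Taken together, the methods above follow one ReportClient lifecycle per worker:
# push a record before and during training, mark it finished, then read it back.
# A condensed sketch of that flow, using only calls that appear in this section
# (the step name "nas", worker id 1, and the desc/performance values are
# illustrative, not real records):
def _report_lifecycle_sketch():
    client = ReportClient()
    client.update("nas", 1, desc={"type": "ResNet"})         # register/refresh the record
    client.update("nas", 1, performance={"accuracy": 0.9})   # attach metrics as they arrive
    client.set_finished("nas", 1)                            # mark the worker done
    record = client.get_record("nas", 1)                     # read the final record back
    logging.debug("record=%s", str(record))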