def _train_single_model(self, model_desc=None, model_id=None, weights_file=None):
    """Build one trainer, register it with the report server, and run training.

    :param model_desc: model description; when None a default trainer is created.
    :param model_id: worker id for the trainer when a description is supplied.
    :param weights_file: optional pretrained weights file passed to the trainer.
    """
    trainer_cls = ClassFactory.get_cls('trainer')
    current_step = self.task.step_name
    if model_desc is None:
        # Default trainer with worker id 0; record is derived from the trainer.
        trainer = trainer_cls(None, 0)
        record = ReportRecord(trainer.step_name, trainer.worker_id,
                              desc=trainer.model_desc)
    else:
        info = dict(worker_id=model_id, desc=model_desc, step_name=current_step)
        record = ReportRecord().load_dict(info)
        logging.debug("Broadcast Record=%s", str(record))
        trainer = trainer_cls(model_desc=model_desc, id=model_id,
                              pretrained_model_file=weights_file)
    ReportClient.broadcast(record)
    ReportServer.add_watched_var(trainer.step_name, trainer.worker_id)
    # Resume an interrupted torch run from its checkpoint when requested.
    if vega.is_torch_backend() and General._resume:
        trainer.load_checkpoint = True
        trainer._resume_training = True
    if self._distributed_training:
        self._do_distributed_fully_train(trainer)
    else:
        self._do_single_fully_train(trainer)
def __init__(self):
    """Set up search space, search algorithm, report and quota for this step."""
    self.step_name = General.step_name
    self.search_space = SearchSpace()
    self.search_alg = SearchAlgorithm(self.search_space)
    self.report = Report()
    self.record = ReportRecord()
    self.record.step_name = self.step_name
    # Copy objective keys from the algorithm config when the config defines them.
    if hasattr(self.search_alg.config, 'objective_keys'):
        self.record.objective_keys = self.search_alg.config.objective_keys
    self.quota = QuotaCompare('restrict')
def __init__(self):
    """Set up search space, search algorithm, record, quota and affinity."""
    self.step_name = General.step_name
    self.search_space = SearchSpace()
    self.search_alg = SearchAlgorithm(self.search_space)
    self.record = ReportRecord()
    self.record.step_name = self.step_name
    # Copy objective keys from the algorithm config when the config defines them.
    if hasattr(self.search_alg.config, 'objective_keys'):
        self.record.objective_keys = self.search_alg.config.objective_keys
    self.quota = QuotaCompare('restrict')
    # Affinity filtering is optional; only build it when a type is configured.
    if General.quota.affinity.type is None:
        self.affinity = None
    else:
        self.affinity = QuotaAffinity(General.quota.affinity)
def do(self):
    """Start to run benchmark evaluator.

    Broadcasts a report record per model, evaluates each one through the
    master, then flushes report output and backs up the output path.
    """
    logger.info("BenchmarkPipeStep started...")
    records = self._get_current_step_records()
    if not records:
        logger.error("There is no model to evaluate.")
        return
    self.master = create_master()
    for rec in records:
        broadcast_rec = ReportRecord(worker_id=rec.worker_id, desc=rec.desc,
                                     step_name=rec.step_name)
        Report().broadcast(broadcast_rec)
        self._evaluate_single_model(rec)
    # Wait for all evaluation workers to finish before collecting results.
    self.master.join()
    for rec in records:
        Report().update_report({"step_name": rec.step_name,
                                "worker_id": rec.worker_id})
    Report().output_step_all_records(step_name=General.step_name,
                                     weights_file=False, performance=True)
    self.master.close_client()
    Report().backup_output_path()
def _evaluate_single_model(self, record):
    """Dispatch GPU and/or Davinci-mobile evaluators for one model record.

    Failures are logged (with traceback) rather than raised, so one broken
    model does not abort the whole benchmark step.
    """
    try:
        worker_info = {"step_name": record.step_name,
                       "worker_id": record.worker_id}
        init_info = dict(worker_id=record.worker_id, desc=record.desc,
                         step_name=record.step_name)
        Report().broadcast(ReportRecord().load_dict(init_info))
        if EvaluatorConfig.gpu_evaluator_enable:
            gpu_cls = ClassFactory.get_cls(ClassType.GPU_EVALUATOR,
                                           "GpuEvaluator")
            self.master.run(gpu_cls(worker_info=worker_info,
                                    model_desc=record.desc,
                                    weights_file=record.weights_file))
        if EvaluatorConfig.davinci_mobile_evaluator_enable:
            mobile_cls = ClassFactory.get_cls(
                ClassType.DAVINCI_MOBILE_EVALUATOR, "DavinciMobileEvaluator")
            self.master.run(mobile_cls(worker_info=worker_info,
                                       model_desc=record.desc,
                                       weights_file=record.weights_file))
    except Exception:
        # Best-effort boundary: log and continue with the next model.
        logger.error(
            "Failed to evaluate model, worker info={}".format(worker_info))
        logger.error(traceback.format_exc())
        return
def _init_report(self):
    """Broadcast an initial report record for this trainer's best model."""
    record_data = dict(worker_id=self.trainer.worker_id,
                       desc=self.cfg.model_desc,
                       step_name=self.trainer.step_name,
                       weights_file=self.best_model_file)
    record = ReportRecord().load_dict(record_data)
    logging.debug("Broadcast Record=%s", str(record))
    ReportClient.broadcast(record)
def __init__(self, search_space=None, **kwargs):
    """Init SearchAlgorithm.

    :param search_space: search space instance the algorithm explores.
    :param kwargs: local-scope overrides merged into the algorithm config.
    """
    super(SearchAlgorithm, self).__init__()
    # Apply keyword overrides to the config, using local scope.
    if self.config and kwargs:
        self.config.from_dict(kwargs)
    self.search_space = search_space
    # A codec is only built when the config declares one.
    self.codec = (Codec(search_space, type=self.config.codec)
                  if hasattr(self.config, 'codec') else None)
    logging.debug("Config=%s", self.config)
    self.record = ReportRecord()
    self.record.step_name = self.step_name
    self._get_search_space_list()
def do(self):
    """Start to run benchmark evaluator.

    Registers each model with the report server, evaluates it through the
    master, then writes step records and backs up the output path.
    """
    logger.info("BenchmarkPipeStep started...")
    records = self._get_current_step_records()
    if not records:
        logger.error("There is no model to evaluate.")
        return
    self.master = create_master()
    for rec in records:
        broadcast_rec = ReportRecord(worker_id=rec.worker_id, desc=rec.desc,
                                     step_name=rec.step_name)
        ReportClient().broadcast(broadcast_rec)
        ReportServer().add_watched_var(rec.step_name, rec.worker_id)
        self._evaluate_single_model(rec)
    # Wait for all evaluation workers before emitting the step's records.
    self.master.join()
    ReportServer().output_step_all_records(step_name=General.step_name)
    self.master.close_client()
    ReportServer().backup_output_path()
def _get_current_step_records(self):
    """Collect the report records this step should train/evaluate.

    After the first pipeline step (or when a models folder is configured),
    records are loaded from the previous step's model folder; otherwise a
    single fresh record is created. Each record is retagged with the
    current step name.
    """
    step_name = self.task.step_name
    models_folder = PipeStepConfig.pipe_step.get("models_folder")
    cur_index = PipelineConfig.steps.index(step_name)
    if cur_index >= 1 or models_folder:
        if not models_folder:
            models_folder = FileOps.join_path(
                TaskOps().local_output_path,
                PipelineConfig.steps[cur_index - 1])
        # Expand the {local_base_path} placeholder from the task config.
        models_folder = models_folder.replace("{local_base_path}",
                                              TaskOps().local_base_path)
        records = ReportServer().load_records_from_model_folder(models_folder)
    else:
        records = [ReportRecord(step_name, 0)]
    logging.debug("Records: {}".format(records))
    for rec in records:
        rec.step_name = step_name
    return records
def _train_single_model(self, model_desc=None, model_id=None):
    """Build one trainer, broadcast its record, and run training.

    :param model_desc: model description; when None a default trainer is created.
    :param model_id: worker id for the trainer when a description is supplied.
    """
    trainer_cls = ClassFactory.get_cls('trainer', "Trainer")
    current_step = self.task.step_name
    if model_desc is None:
        trainer = trainer_cls(None, 0)
    else:
        info = dict(worker_id=model_id, desc=model_desc, step_name=current_step)
        record = ReportRecord().load_dict(info)
        logging.debug("Broadcast Record=%s", str(record))
        Report().broadcast(record)
        trainer = trainer_cls(model_desc=model_desc, id=model_id)
    # Resume an interrupted torch run from its checkpoint when requested.
    if vega.is_torch_backend() and General._resume:
        trainer.load_checkpoint = True
        trainer._resume_training = True
    if cls_trainer.config.distributed:
        # NOTE(review): the distributed path ignores the trainer built above
        # and calls the helper without arguments — presumably it builds its
        # own workers; confirm against _do_distributed_fully_train's signature.
        self._do_distributed_fully_train()
    else:
        self._do_single_fully_train(trainer)
def _load_single_model_records(self):
    """Build a single-record list from the configured model desc and weights.

    Returns an empty list (after logging an error) when the description or
    the pretrained weights file is missing.
    """
    model_desc = PipeStepConfig.model.model_desc
    desc_file = PipeStepConfig.model.model_desc_file
    if desc_file:
        # A desc file on disk takes precedence over the inline description.
        desc_file = desc_file.replace("{local_base_path}",
                                      TaskOps().local_base_path)
        model_desc = Config(desc_file)
    if not model_desc:
        logger.error("Model desc or Model desc file is None.")
        return []
    weights_file = PipeStepConfig.model.pretrained_model_file
    if not weights_file:
        logger.error("Model file is None.")
        return []
    if not os.path.exists(weights_file):
        logger.error("Model file is not existed.")
        return []
    # NOTE(review): worker_id is the string "1" here while other paths use
    # ints — presumably downstream code tolerates both; confirm.
    record_data = dict(worker_id="1", desc=model_desc,
                       weights_file=weights_file)
    return [ReportRecord().load_dict(record_data)]
class Generator(object):
    """Convert search space and search algorithm, sample a new model."""

    def __init__(self):
        """Set up search space, search algorithm, report and quota."""
        self.step_name = General.step_name
        self.search_space = SearchSpace()
        self.search_alg = SearchAlgorithm(self.search_space)
        self.report = Report()
        self.record = ReportRecord()
        self.record.step_name = self.step_name
        # Copy objective keys from the algorithm config when defined.
        if hasattr(self.search_alg.config, 'objective_keys'):
            self.record.objective_keys = self.search_alg.config.objective_keys
        self.quota = QuotaCompare('restrict')

    @property
    def is_completed(self):
        """Define a property to determine search algorithm is completed."""
        return self.search_alg.is_completed or self.quota.is_halted()

    def sample(self):
        """Sample a work id and model from search algorithm.

        :return: list of (worker_id, model_desc) tuples, or None when the
            algorithm produced nothing.
        """
        res = self.search_alg.search()
        if not res:
            return None
        if not isinstance(res, list):
            res = [res]
        if len(res) == 0:
            return None
        out = []
        for sample in res:
            desc = sample.get("desc") if isinstance(sample, dict) else sample[1]
            desc = self._decode_hps(desc)
            model_desc = deepcopy(desc)
            if "modules" in desc:
                PipeStepConfig.model.model_desc = deepcopy(desc)
            elif "network" in desc:
                # Merge the sampled network into the configured base desc.
                origin_desc = PipeStepConfig.model.model_desc
                desc = update_dict(desc["network"], origin_desc)
                PipeStepConfig.model.model_desc = deepcopy(desc)
            # Quota restrictions may drop a sampled model entirely.
            if self.quota.is_filtered(desc):
                continue
            record = self.record.from_sample(sample, desc)
            Report().broadcast(record)
            out.append((record.worker_id, model_desc))
        return out

    def update(self, step_name, worker_id):
        """Update search algorithm accord to the worker path.

        :param step_name: step name
        :param worker_id: current worker id
        :return:
        """
        report = Report()
        record = report.receive(step_name, worker_id)
        logging.debug("Get Record=%s", str(record))
        self.search_alg.update(record.serialize())
        report.dump_report(record.step_name, record)
        self.dump()
        # FIX: this log call's string literal was garbled by a raw line break
        # inside the quotes; reconstructed as a single %-style format string.
        logging.info("Update Success. step_name=%s, worker_id=%s",
                     step_name, worker_id)
        logging.info("Best values: %s",
                     Report().print_best(step_name=General.step_name))

    @staticmethod
    def _decode_hps(hps):
        """Decode hps: `trainer.optim.lr : 0.1` to dict format.

        And convert to `zeus.common.config import Config` object
        This Config will be override in Trainer or Datasets class
        The override priority is: input hps > user configuration >
        default configuration

        :param hps: hyper params
        :return: dict
        """
        hps_dict = {}
        if hps is None:
            return None
        if isinstance(hps, tuple):
            return hps
        for hp_name, value in hps.items():
            # Expand a dotted name into a nested dict, innermost key first.
            hp_dict = {}
            for key in list(reversed(hp_name.split('.'))):
                if hp_dict:
                    hp_dict = {key: hp_dict}
                else:
                    hp_dict = {key: value}
            # update cfg with hps
            hps_dict = update_dict(hps_dict, hp_dict, [])
        return Config(hps_dict)

    def dump(self):
        """Dump generator to file."""
        step_path = TaskOps().step_path
        _file = os.path.join(step_path, ".generator")
        with open(_file, "wb") as f:
            pickle.dump(self, f, protocol=pickle.HIGHEST_PROTOCOL)

    @classmethod
    def restore(cls):
        """Restore generator from file.

        :return: the unpickled Generator, or None when no dump exists.
        """
        step_path = TaskOps().step_path
        _file = os.path.join(step_path, ".generator")
        if os.path.exists(_file):
            with open(_file, "rb") as f:
                return pickle.load(f)
        else:
            return None