def do(self):
    """Evaluate every model record of the current step and publish the results.

    Logs an error and returns early when there is nothing to evaluate.
    Otherwise a ``Master`` is created, each record is broadcast and handed to
    ``_evaluate_single_model``, the reports are updated once the master has
    joined, and the step records are written out and backed up.
    """
    logger.info("BenchmarkPipeStep started...")
    step_records = self._get_current_step_records()
    if not step_records:
        logger.error("There is no model to evaluate.")
        return

    self.master = Master()
    for item in step_records:
        initial = ReportRecord(
            worker_id=item.worker_id,
            desc=item.desc,
            step_name=item.step_name,
        )
        Report().broadcast(initial)
        self._evaluate_single_model(item)
        # Collect whatever has already finished while the rest is submitted.
        self.master.pop_all_finished_evaluate_worker()

    # Wait for the outstanding evaluations, then collect the remainder.
    self.master.join()
    self.master.pop_all_finished_evaluate_worker()

    for item in step_records:
        worker_key = dict(step_name=item.step_name, worker_id=item.worker_id)
        Report().update_report(worker_key)

    Report().output_step_all_records(
        step_name=General.step_name,
        weights_file=False,
        performance=True,
    )
    self.master.close_client()
    Report().backup_output_path()
def _broadcast(self, pfms):
    """Store performance values on this worker's record and broadcast it.

    The record is fetched from the report by ``self.step_name`` and
    ``self.worker_id``. Existing performance data is updated in place; when
    the record has none yet, ``pfms`` becomes its performance.

    :param pfms: performance values to attach to the record
    """
    current = Report().receive(self.step_name, self.worker_id)
    if not current.performance:
        current.performance = pfms
    else:
        current.performance.update(pfms)
    Report().broadcast(current)
    logging.debug("valid record: {}".format(current))
def search(self):
    """Search one mutated model.

    A code is picked at random from the pareto front of the 'random' and
    'mutate' records, one randomly chosen gene is replaced by a different
    value below that gene's upper bound, and the mutated code is decoded by
    ``self.codec``.

    :return: current number of samples, and the model
    :raises IndexError: if the pareto front is empty (``random.choice``), or if
        the chosen gene has a single allowed value, or its current value is not
        below its upper bound.
    """
    desc = deepcopy(self.search_space)
    search_desc = desc.custom
    records = Report().get_pareto_front_records(['random', 'mutate'])
    codes = [record.desc['custom']['code'] for record in records]

    num_ops = len(search_desc.op_names)
    # Exclusive upper bound of each gene, in code order.
    upper_bounds = [
        num_ops, 2, 2, num_ops, num_ops, 5, 5, num_ops, num_ops, 8, 8,
        num_ops, num_ops, 4, 4, 5, 5, 6, 6
    ]

    code_to_mutate = random.choice(codes)
    index = random.randrange(len(upper_bounds))
    # Gene ``index`` is the single character at position ``index + 1`` of the
    # code; the first character is not covered by ``upper_bounds``.
    choices = list(range(upper_bounds[index]))
    # Remove the current value so the mutation always changes the gene.
    choices.pop(int(code_to_mutate[index + 1], 36))
    choice = random.choice(choices)

    # Genes are parsed with int(..., 36), so each one must be written back as
    # exactly one base-36 digit. str(choice) emitted two characters for values
    # >= 10, which shifted every later gene in the code.
    # NOTE(review): int(..., 36) accepts either letter case; confirm which case
    # the rest of the pipeline expects when codes are compared as strings.
    base36_digits = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    code_mutated = (
        code_to_mutate[:index + 1]
        + base36_digits[choice]
        + code_to_mutate[index + 2:]
    )

    search_desc['code'] = code_mutated
    search_desc['method'] = "mutate"
    logging.info("Mutate from {} to {}".format(code_to_mutate, code_mutated))
    search_desc = self.codec.decode(search_desc)
    self.sample_count += 1
    desc['custom'] = search_desc
    return self.sample_count, desc
def search(self):
    """Search one NetworkDesc from search space.

    While ``self.random_count`` is below ``self.random_models`` a random
    sample is returned. After that a new encoding is produced from the
    pareto front by mutation or crossover (chosen with equal probability)
    and decoded by ``self.codec``.

    :return: search id, network desc
    :rtype: int, NetworkDesc
    """
    desc = deepcopy(self.search_space)

    # Random phase.
    # NOTE(review): random_count is not incremented here; presumably
    # _random_sample() does it - confirm.
    if self.random_count < self.random_models:
        sampled_codec = self._random_sample()
        desc.update({"trainer.codec": sampled_codec})
        return self.random_count, desc

    # Evolution phase.
    self.ea_epoch += 1
    # todo: according to gflops and acc.
    front = Report().get_pareto_front_records(self.step_name, self.num_individual)
    codes = []
    for item in front:
        codes.append(item.desc.get('nbit_w_list') + item.desc.get('nbit_a_list'))
    logging.info("codes=%s", codes)

    if len(codes) >= 2:
        parent_a, parent_b = random.sample(codes, 2)
    else:
        # A single candidate acts as both parents.
        parent_a = parent_b = codes[0]

    if random.randint(0, 1) == 0:
        child = self.mutatation(parent_a)  # mutate
    else:
        child, _ = self.crossover(parent_a, parent_b)  # crossover

    self.ea_count += 1
    if self.ea_count % self.num_individual == 0:
        self.ea_epoch += 1

    decoded = self.codec.decode(child)
    desc.update({"trainer.codec": decoded})
    return self.random_count + self.ea_count, desc
def _get_current_step_records(self):
    """Return the records of this step that have a weights file.

    Records are loaded from a models folder when one is configured or when
    this is not the first pipeline step (the previous step's output folder
    is used in that case). Otherwise ``_load_single_model_records`` supplies
    them. Records without ``weights_file`` are logged as errors and dropped;
    the remaining ones are re-labelled with the current step name.
    """
    step_name = General.step_name
    models_folder = PipeStepConfig.pipe_step.get("models_folder")
    cur_index = PipelineConfig.steps.index(step_name)

    if cur_index < 1 and not models_folder:
        records = self._load_single_model_records()
    else:
        if not models_folder:
            # Default to the output folder of the previous step.
            previous_step = PipelineConfig.steps[cur_index - 1]
            models_folder = FileOps.join_path(
                TaskOps().local_output_path, previous_step)
        models_folder = models_folder.replace(
            "{local_base_path}", TaskOps().local_base_path)
        records = Report().load_records_from_model_folder(models_folder)

    usable = []
    for item in records:
        if item.weights_file:
            item.step_name = General.step_name
            usable.append(item)
        else:
            logger.error("Model file is not existed, id={}".format(
                item.worker_id))
    logging.debug("Records: {}".format(usable))
    return usable
def search(self):
    """Search one mutated model.

    Starts from a random pareto-front code of the 'random' and 'mutate'
    records and applies ``mutate_once`` until ``self.num_mutate`` calls have
    actually changed the code.

    :return: dict with ``worker_id`` (current number of samples) and ``desc``
    """
    desc = deepcopy(self.search_space)
    search_desc = desc.custom
    # TODO: merge sr ea in one pipe step.
    front = Report().get_pareto_front_records(['random', 'mutate'])
    codes = [item.desc['custom']['code'] for item in front]

    code_to_mutate = random.choice(codes)
    code_mutated = code_to_mutate
    num_candidates = len(search_desc["candidates"])

    applied = 0
    while applied < self.num_mutate:
        candidate = self.mutate_once(code_mutated, num_candidates)
        if candidate == code_mutated:
            # Only attempts that change the code count as a mutation.
            continue
        applied += 1
        code_mutated = candidate

    logging.info("Mutate from {} to {}".format(code_to_mutate, code_mutated))
    search_desc['code'] = code_mutated
    search_desc['method'] = "mutate"
    search_desc = self.codec.decode(search_desc)
    desc['custom'] = search_desc
    self.sample_count += 1
    return {"worker_id": self.sample_count, "desc": desc}
def _evaluate_single_model(self, record):
    """Submit one model record to the GPU evaluator through the master.

    The evaluator class is looked up in ``ClassFactory``; an initial report
    record is broadcast, an evaluator is built from the record's description
    and weights file, and it is passed to ``self.master.run``. Failures are
    logged with their traceback and swallowed, so one bad record does not
    abort the caller.

    :param record: record providing ``step_name``, ``worker_id``, ``desc``
        and ``weights_file``
    """
    try:
        cls_gpu_evaluator = ClassFactory.get_cls(ClassType.GPU_EVALUATOR)
    except Exception:
        logger.error(
            "Failed to create Evaluator, please check the config file.")
        logger.error(traceback.format_exc())
        return

    # Bound before the try block so the handler below can always format it.
    # Previously a failure while building worker_info left the name unbound
    # and the handler raised UnboundLocalError, hiding the real error.
    worker_info = None
    try:
        worker_info = {
            "step_name": record.step_name,
            "worker_id": record.worker_id
        }
        _record = dict(worker_id=record.worker_id,
                       desc=record.desc,
                       step_name=record.step_name)
        _init_record = ReportRecord().load_dict(_record)
        Report().broadcast(_init_record)
        evaluator = cls_gpu_evaluator(worker_info=worker_info,
                                      model_desc=record.desc,
                                      weights_file=record.weights_file)
        self.master.run(evaluator)
    except Exception:
        logger.error(
            "Failed to evaluate model, worker info={}".format(worker_info))
        logger.error(traceback.format_exc())
        return
def do(self):
    """Run the sampling/training loop of this pipe step until completion.

    Samples from the generator until it reports completion, dispatching a
    trainer for every non-empty sample and sleeping briefly otherwise. After
    the master has joined, the pareto front is logged and written out.
    """
    logging.debug("NasPipeStep started...")
    while not self.generator.is_completed:
        sampled = self.generator.sample()
        if not sampled:
            # Nothing to train right now; back off before polling again.
            time.sleep(0.5)
        else:
            self._dispatch_trainer(sampled)
        self._after_train(wait_until_finish=False)

    self.master.join()
    self._after_train(wait_until_finish=True)

    front = Report().pareto_front(General.step_name)
    logging.debug("Pareto_front values: %s", front)
    Report().output_pareto_front(General.step_name)
    self.master.close_client()
def search(self):
    """Search one NetworkDesc from search space.

    The first ``self.random_models`` calls return random samples. Later
    calls derive a new backbone encoding from the pareto front by mutation
    or crossover (chosen with equal probability) and decode it. In both
    cases a snapshot of the description is stored under ``"trainer.codec"``.

    :return: search id, network desc
    :rtype: int, NetworkDesc
    """
    # Random phase.
    if self.random_count < self.random_models:
        self.random_count += 1
        desc = self._random_sample()
        snapshot = dict(desc)
        desc.update({"trainer.codec": snapshot})
        return self.random_count, desc

    # Evolution phase.
    self.ea_epoch += 1
    front = Report().get_pareto_front_records(self.step_name, self.num_individual)
    codes = []
    for item in front:
        codes.append(item.desc.get('backbone').get('encoding'))
    logging.info("codes=%s", codes)

    if len(codes) >= 2:
        parent_a, parent_b = random.sample(codes, 2)
    else:
        # A single candidate acts as both parents.
        parent_a = parent_b = codes[0]

    if random.randint(0, 1) == 0:
        child = self.mutatation(parent_a)  # mutate
    else:
        child, _ = self.crossover(parent_a, parent_b)  # crossover

    self.ea_count += 1
    if self.ea_count % self.num_individual == 0:
        self.ea_epoch += 1

    desc = self.codec.decode(child)
    snapshot = dict(desc)
    desc.update({"trainer.codec": snapshot})
    return self.random_count + self.ea_count, desc
def get_pareto_front(self):
    """Return the code of one randomly chosen pareto-front record."""
    front = Report().get_pareto_front_records()
    codes = [item.desc['code'] for item in front]
    return random.choice(codes)
def do(self):
    """Run fully train, either distributed or through a local master.

    With a distributed trainer config the distributed routine is used.
    Otherwise the step records are loaded, trained through a new ``Master``,
    their reports are updated and the step records are written out. The
    output path is backed up in both cases.
    """
    logger.info("FullyTrainPipeStep started...")
    cls_trainer = ClassFactory.get_cls('trainer')
    if not cls_trainer.config.distributed:
        step_records = self._get_current_step_records()
        logger.debug("load pipestep records: {}".format(step_records))
        self.master = Master()
        self._train_multi_models(step_records)
        for item in step_records:
            worker_key = dict(step_name=item.step_name, worker_id=item.worker_id)
            Report().update_report(worker_key)
        Report().output_step_all_records(
            step_name=self.task.step_name,
            weights_file=True,
            performance=True,
        )
        self.master.close_client()
    else:
        self._do_distributed_fully_train()
    Report().backup_output_path()
def __init__(self):
    """Create the search space, search algorithm and the record template.

    The record template carries the current step name and, when the search
    algorithm's config defines them, its ``objective_keys``.
    """
    self.step_name = General.step_name
    self.search_space = SearchSpace()
    self.search_alg = SearchAlgorithm(self.search_space.search_space)
    self.report = Report()

    template = ReportRecord()
    self.record = template
    template.step_name = self.step_name
    alg_config = self.search_alg.config
    if hasattr(alg_config, 'objective_keys'):
        template.objective_keys = alg_config.objective_keys
def _train_multi_models(self, records):
    """Train every record and, if required, evaluate every record afterwards.

    :param records: records providing ``desc`` and ``worker_id``
    """
    # Training pass: report each finished worker as models are submitted.
    for item in records:
        self._train_single_model(item.desc, item.worker_id)
        finished = self.master.pop_finished_worker()
        Report().update_report(finished)
    self.master.join()
    self.master.pop_all_finished_train_worker()

    # Evaluation pass, only when this step asks for it.
    if self.need_evaluate:
        for item in records:
            self._evaluate_single_model(item)
            self.master.pop_all_finished_evaluate_worker()
        self.master.join()
        self.master.pop_all_finished_evaluate_worker()
def _train_single_model(self, model_desc=None, model_id=None):
    """Build a trainer for one model and run fully train with it.

    When a description is given, an initial record is broadcast and the
    trainer is built around the model created from that description.
    Without one, the trainer is built with no model and worker id 0.

    :param model_desc: network description, or None
    :param model_id: worker id used for the record and the trainer
    """
    cls_trainer = ClassFactory.get_cls('trainer')
    step_name = self.task.step_name

    if model_desc is None:
        trainer = cls_trainer(None, 0)
    else:
        initial = {
            "worker_id": model_id,
            "desc": model_desc,
            "step_name": step_name,
        }
        record = ReportRecord().load_dict(initial)
        logging.debug("Broadcast Record=%s", str(record))
        Report().broadcast(record)
        network = NetworkDesc(model_desc).to_model()
        trainer = cls_trainer(network, model_id)

    if not cls_trainer.config.distributed:
        self._do_single_fully_train(trainer)
    else:
        self._do_distributed_fully_train()
def _init_evaluator(self):
    """Register one evaluator per configured evaluator class.

    Does nothing when ``_use_evaluator`` reports that evaluation is not
    used. Otherwise the model is built from the description stored on this
    worker's record and shared by all evaluators that are added.
    """
    use_evaluator, cls_evaluator_set = self._use_evaluator()
    if use_evaluator:
        record = Report().receive(self.step_name, self.worker_id)
        network = NetworkDesc(record.desc).to_model()
        for evaluator_cls in cls_evaluator_set:
            self.add_evaluator(
                evaluator_cls(worker_info=self.worker_info, model=network))
def __init__(self, search_space=None, **kwargs):
    """Init SearchAlgorithm.

    :param search_space: search space handed to the codec and kept on the
        instance
    :param kwargs: config overrides loaded into this instance's config
    """
    super(SearchAlgorithm, self).__init__()
    # Apply kwargs to a config created for this instance only.
    # NOTE(review): presumably self.config is a class here and calling it
    # yields a private copy - confirm.
    if self.config and kwargs:
        self.config = self.config()
        load_conf_from_desc(self.config, kwargs)
    self.search_space = search_space
    self.codec = (
        Codec(search_space, type=self.config.codec)
        if hasattr(self.config, 'codec')
        else None
    )
    logging.debug("Config=%s", obj2config(self.config))
    self.report = Report()
    template = ReportRecord()
    self.record = template
    template.step_name = self.step_name
def sample(self):
    """Sample worker ids and model descriptions from the search algorithm.

    A single search result is treated as a one-element list, and tuple
    results are read as ``(worker_id, desc)``. Every sample is loaded into
    the record, broadcast, and its description decoded.

    :return: list of ``(worker_id, desc)`` tuples, or None when the search
        returned nothing
    """
    found = self.search_alg.search()
    if not found:
        return None

    batch = found if isinstance(found, list) else [found]
    out = []
    for item in batch:
        if isinstance(item, tuple):
            item = {"worker_id": item[0], "desc": item[1]}
        record = self.record.load_dict(item)
        logging.debug("Broadcast Record=%s", str(record))
        Report().broadcast(record)
        decoded = self._decode_hps(record.desc)
        out.append((record.worker_id, decoded))
    return out
def _get_current_step_records(self):
    """Return the records to train in this step.

    Records are loaded from a models folder when one is configured or when
    this is not the first pipeline step (the previous step's output folder
    is used in that case). Otherwise a single ``ReportRecord`` with worker
    id 0 is created. Every record is labelled with the current step name.
    """
    step_name = self.task.step_name
    models_folder = PipeStepConfig.pipe_step.get("models_folder")
    cur_index = PipelineConfig.steps.index(step_name)

    if cur_index < 1 and not models_folder:
        records = [ReportRecord(step_name, 0)]
    else:
        if not models_folder:
            # Default to the output folder of the previous step.
            previous_step = PipelineConfig.steps[cur_index - 1]
            models_folder = FileOps.join_path(
                TaskOps().local_output_path, previous_step)
        models_folder = models_folder.replace(
            "{local_base_path}", TaskOps().local_base_path)
        records = Report().load_records_from_model_folder(models_folder)

    logging.debug("Records: {}".format(records))
    for item in records:
        item.step_name = step_name
    return records
def update(self, step_name, worker_id):
    """Feed a worker's record to the search algorithm and dump it.

    :param step_name: step name
    :param worker_id: current worker id
    :return: None
    """
    report = Report()
    record = report.receive(step_name, worker_id)
    logging.debug("Get Record=%s", str(record))

    serialized = record.serialize()
    self.search_alg.update(serialized)
    report.dump_report(record.step_name, record)
    logging.info("Update Success. step_name=%s, worker_id=%s",
                 step_name, worker_id)

    # The pareto front is read for the current general step, not the argument.
    best = Report().pareto_front(step_name=General.step_name)
    logging.info("Best values: %s", best)
def _broadcast(self, epoch=None):
    """Copy the trainer's current state onto its record and broadcast it.

    Performance keys without an objective get one: 'MIN' for 'gflops' and
    'kparams', 'MAX' for everything else.

    :param epoch: epoch stored on the record when the trainer config has
        ``report_on_epoch`` set
    """
    trainer = self.trainer
    record = Report().receive(trainer.step_name, trainer.worker_id)

    if trainer.config.report_on_epoch:
        record.epoch = epoch
    # todo: remove in FinedGrainedSpace
    if trainer.config.codec:
        record.desc = trainer.config.codec
    if not record.desc:
        record.desc = trainer.model_desc

    record.performance = trainer.performance
    record.objectives = trainer.valid_metrics.objectives
    if record.performance is not None:
        for key in record.performance:
            if key in record.objectives:
                continue
            direction = 'MIN' if key in ('gflops', 'kparams') else 'MAX'
            record.objectives.update({key: direction})

    record.model_path = trainer.model_path
    record.checkpoint_path = trainer.checkpoint_file
    record.weights_file = trainer.weights_file
    Report().broadcast(record)
    logging.debug("report_callback record: {}".format(record))
def _save_best(self, desc):
    """Broadcast ``desc`` as the record after the current sample count.

    The record for worker id ``self.sample_count + 1`` is given a fixed
    performance of ``{"accuracy": 100}`` together with the description.

    :param desc: description to store on the record
    """
    next_worker_id = self.sample_count + 1
    best = Report().receive(self.step_name, next_worker_id)
    best.performance = dict(accuracy=100)
    best.desc = desc
    Report().broadcast(best)
def after_train(self, logs=None):
    """Broadcast the final record and close the report for this trainer.

    :param logs: unused by this callback
    """
    self._broadcast(self.epoch)
    trainer = self.trainer
    Report().close(trainer.step_name, trainer.worker_id)