def search(self):
    """Search a sample."""
    if self.need_reignite_queue:
        # share weights and use reignite model
        worker_id, desc = self.need_reignite_queue.popitem()
        reignite_record = ReportServer().get_record(self.step_name, worker_id)
        desc['network.backbone.code'] = reignite_record.desc['code']
        desc['network.backbone.weight_file'] = reignite_record.weights_file
        self.finished_reignite_queue[worker_id] = desc
        logging.info("Finished reignited model, worker_id:{} desc:{}".format(worker_id, desc))
        return dict(worker_id=worker_id, encoded_desc=desc)
    pareto_records = ReportServer().get_pareto_front_records(choice='normal')
    best_record = pareto_records[0] if pareto_records else None
    desc = self.search_space.sample()
    arch_code = desc.get('network.backbone.code')
    if best_record:
        desc['network.backbone.weight_file'] = best_record.weights_file
    if best_record or not self.config.retain_original_code:
        desc['network.backbone.code'] = self._mutate_serialnet(arch_code)
    self.sample_count += 1
    if self.config.reignite and self.config.reignite_desc:
        return self._reignite(desc)
    logging.info("desc:{}".format(desc))
    return dict(worker_id=self.sample_count, encoded_desc=desc)
def shutdown_cluster():
    """Shut down the distributed cluster."""
    # only needed when the pipeline runs in parallel (cluster) mode
    if not General._parallel:
        return
    try:
        logging.info("Trying to shut down the cluster.")
        # stop ReportServer
        from zeus.report import ReportServer
        ReportServer.stop()
        # stop Master
        from zeus.trainer.utils import load_master_ip
        from distributed import Client
        ip, port = load_master_ip()
        if ip is None or port is None:
            logging.info("Stand-alone mode, no need to shut down the cluster.")
            return
        shutdown_client = Client("{}:{}".format(ip, port))
        logging.info("Cluster will be shut down.")
        shutdown_client.shutdown()
        shutdown_client.close()
        del shutdown_client
        time.sleep(15)
        logging.info("Cluster is shut down.")
    except Exception as e:
        logging.error("Pipeline's cluster shutdown error: {}".format(str(e)))
        logging.error(traceback.format_exc())
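# A minimal sketch showing one way to also trigger shutdown_cluster() on normal
# interpreter exit, complementing the SIGINT/SIGTERM handlers that Pipeline.run()
# installs below; the atexit wiring here is an illustrative assumption, not part
# of the original code.
import atexit

atexit.register(shutdown_cluster)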
def _train_single_model(self, model_desc=None, model_id=None, weights_file=None):
    """Create a trainer for one model, broadcast its record, and start training."""
    cls_trainer = ClassFactory.get_cls('trainer')
    step_name = self.task.step_name
    if model_desc is not None:
        sample = dict(worker_id=model_id, desc=model_desc, step_name=step_name)
        record = ReportRecord().load_dict(sample)
        logging.debug("Broadcast Record=%s", str(record))
        trainer = cls_trainer(model_desc=model_desc, id=model_id,
                              pretrained_model_file=weights_file)
    else:
        trainer = cls_trainer(None, 0)
        record = ReportRecord(trainer.step_name, trainer.worker_id, desc=trainer.model_desc)
    ReportClient.broadcast(record)
    ReportServer.add_watched_var(trainer.step_name, trainer.worker_id)
    # resume training from checkpoint if requested
    if vega.is_torch_backend() and General._resume:
        trainer.load_checkpoint = True
        trainer._resume_training = True
    if self._distributed_training:
        self._do_distributed_fully_train(trainer)
    else:
        self._do_single_fully_train(trainer)
def _simulate_tiny_pipeline(self, cfg_tiny):
    """Simulate a tiny pipeline by using one sample and one epoch."""
    report = ReportServer()
    for i, step_name in enumerate(PipelineConfig.steps):
        step_cfg = cfg_tiny.get(step_name)
        if step_cfg.pipe_step.type != 'SearchPipeStep':
            continue
        step_cfg.trainer.distributed = False
        step_cfg.trainer.epochs = 1
        self.restrict_config.trials[step_name] = 1
        General.step_name = step_name
        PipeStepConfig.from_dict(step_cfg)
        pipestep = PipeStep()
        if i == 0:
            pipestep.do()
            record = report.get_step_records(step_name)[-1]
            self.epoch_time = record.runtime
            _worker_path = TaskOps().local_base_path
            if os.path.exists(_worker_path):
                os.system('rm -rf {}'.format(_worker_path))
        if step_cfg.pipe_step.type == 'SearchPipeStep':
            self.params_dict[step_name]['max_samples'] = pipestep.generator.search_alg.max_samples
        _file = os.path.join(TaskOps().step_path, ".generator")
        if os.path.exists(_file):
            os.system('rm {}'.format(_file))
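# A minimal sketch of the shape _simulate_tiny_pipeline() expects from cfg_tiny,
# using attribute-style namespaces to stand in for vega's config objects; the
# step names and values below are illustrative assumptions only.
from types import SimpleNamespace

cfg_tiny_example = {
    'nas': SimpleNamespace(
        pipe_step=SimpleNamespace(type='SearchPipeStep'),
        trainer=SimpleNamespace(distributed=True, epochs=50)),
    'fully_train': SimpleNamespace(
        pipe_step=SimpleNamespace(type='TrainPipeStep'),
        trainer=SimpleNamespace(distributed=False, epochs=160)),
}
# the simulator skips non-search steps and forces trainer.distributed = False
# and trainer.epochs = 1 before running the first search step once.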
def _dispatch_trainer(self, samples):
    """Create a trainer and evaluator for each sample and submit them to the master."""
    for (id, desc, hps) in samples:
        cls_trainer = ClassFactory.get_cls(ClassType.TRAINER)
        TrainerConfig.from_dict(self.user_trainer_config)
        trainer = cls_trainer(id=id, model_desc=desc, hps=hps)
        evaluator = self._get_evaluator(trainer)
        logging.info("submit trainer, id={}".format(id))
        ReportServer.add_watched_var(General.step_name, trainer.worker_id)
        self.master.run(trainer, evaluator)
def __init__(self):
    """Initialize."""
    super().__init__()
    if not hasattr(self, "generator"):
        self.generator = Generator.restore()
    if not self.generator:
        self.generator = Generator()
    ReportServer.restore()
    self.master = create_master(update_func=self.generator.update)
    self.user_trainer_config = TrainerConfig().to_dict()
def do(self):
    """Start to run fully train with horovod or local trainer."""
    logger.info("TrainPipeStep started...")
    records = self._get_current_step_records()
    logger.debug("load pipestep records: {}".format(records))
    self.master = create_master()
    self._train_multi_models(records)
    self.master.join()
    ReportServer().output_step_all_records(step_name=self.task.step_name)
    self.master.close_client()
    ReportServer().backup_output_path()
def close_client(self):
    """Close cluster client."""
    ReportServer.stop()
    self._thread_running = False
    # wait for the monitor thread to exit
    time.sleep(1)
    if hasattr(self, "client") and self.client is not None:
        self.client.close()
        del self.client
    # wait for the cluster to close
    time.sleep(1)
def search(self):
    """Search code of one model.

    :return: searched code of the model.
    """
    records = ReportServer().get_pareto_front_records(['nas'])
    encodings = []
    for record in records:
        custom = record.desc['custom']
        encodings.append(custom['encoding'])
    pareto_front = encodings
    model_str = random.choice(pareto_front)
    logging.debug("model_str: %s", model_str)
    ratio = np.random.randint(low=0, high=self.num_transform + 1)
    logging.debug("ratio: %s", ratio)
    context_path = model_str[0].split('_')
    spatial_path = model_str[1].split('_')
    logging.debug("context_path: %s, spatial_path: %s", context_path, spatial_path)
    spatial_path = self.mutate_channel(spatial_path)
    context_path[-1] = self.do_transform(context_path[-1], num_mutate=ratio)
    spatial_path[-1] = self.do_transform(spatial_path[-1], num_mutate=self.num_transform - ratio)
    encoding = ['_'.join(context_path), '_'.join(spatial_path)]
    logging.info("Mutate from {} to {}".format(model_str, encoding))
    return encoding
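# A small worked example of the encoding handled above, assuming (as the code
# implies) that a model string is a pair [context_path, spatial_path] whose
# tokens are joined by '_'; the token values below are made up for illustration.
model_str = ['12_23_34', '45_56']
context_path = model_str[0].split('_')   # ['12', '23', '34']
spatial_path = model_str[1].split('_')   # ['45', '56']
# after mutating individual tokens, the paths are re-joined the same way:
encoding = ['_'.join(context_path), '_'.join(spatial_path)]
assert encoding == model_str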
def search(self):
    """Search one mutated model.

    :return: current number of samples, and the model
    """
    desc = deepcopy(self.search_space)
    search_desc = desc.custom
    records = ReportServer().get_pareto_front_records(['random', 'mutate'])
    codes = []
    for record in records:
        custom = record.desc['custom']
        codes.append(custom['code'])
    num_ops = len(search_desc.op_names)
    # upper bound of valid values for each position in the code string
    upper_bounds = [num_ops, 2, 2, num_ops, num_ops, 5, 5, num_ops, num_ops,
                    8, 8, num_ops, num_ops, 4, 4, 5, 5, 6, 6]
    code_to_mutate = random.choice(codes)
    index = random.randrange(len(upper_bounds))
    choices = list(range(upper_bounds[index]))
    # drop the current value (base-36 encoded), so the mutation always changes the gene
    choices.pop(int(code_to_mutate[index + 1], 36))
    choice = random.choice(choices)
    code_mutated = code_to_mutate[:index + 1] + str(choice) + code_to_mutate[index + 2:]
    search_desc['code'] = code_mutated
    search_desc['method'] = "mutate"
    logging.info("Mutate from {} to {}".format(code_to_mutate, code_mutated))
    search_desc = self.codec.decode(search_desc)
    self.sample_count += 1
    desc['custom'] = search_desc
    return self.sample_count, desc
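# A self-contained sketch of the single-gene mutation used above, assuming a
# code string whose first character is a prefix tag and whose remaining
# characters are base-36 digits bounded by upper_bounds; the function name and
# sample call are hypothetical.
import random

def mutate_one_gene(code, upper_bounds):
    """Change exactly one gene of `code` to a different valid value."""
    index = random.randrange(len(upper_bounds))
    choices = list(range(upper_bounds[index]))
    choices.pop(int(code[index + 1], 36))   # exclude the current value
    return code[:index + 1] + str(random.choice(choices)) + code[index + 2:]

# e.g. mutate_one_gene('m0121', [4, 2, 3, 4]) might return 'm0101'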
def search(self):
    """Search one NetworkDesc from search space.

    :return: search id, network desc
    :rtype: int, NetworkDesc
    """
    if self.random_count < self.random_models:
        self.random_count += 1
        desc = self._random_sample()
        # desc.update({"trainer.codec": dict(desc)})
        return self.random_count, desc
    records = ReportServer().get_pareto_front_records(self.step_name, self.num_individual)
    codes = [record.desc.get('backbone').get('encoding') for record in records]
    logging.info("codes=%s", codes)
    if len(codes) < 2:
        encoding1, encoding2 = codes[0], codes[0]
    else:
        encoding1, encoding2 = random.sample(codes, 2)
    choice = random.randint(0, 1)
    # mutate
    if choice == 0:
        encoding_new = self.mutatation(encoding1)
    # crossover
    else:
        encoding_new, _ = self.crossover(encoding1, encoding2)
    self.ea_count += 1
    if self.ea_count % self.num_individual == 0:
        self.ea_epoch += 1
    desc = self.codec.decode(encoding_new)
    return self.random_count + self.ea_count, desc
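# self.mutatation and self.crossover are defined elsewhere in the class; for
# list encodings, a common realisation is bit-flip mutation plus single-point
# crossover. The sketch below is an illustrative assumption, not the repo's
# actual implementation.
import random

def mutate_example(encoding):
    """Flip one randomly chosen position of a 0/1 list encoding."""
    new = list(encoding)
    i = random.randrange(len(new))
    new[i] = 1 - new[i]
    return new

def crossover_example(enc1, enc2):
    """Exchange the tails of two encodings at a random cut point."""
    point = random.randrange(1, len(enc1))
    return enc1[:point] + enc2[point:], enc2[:point] + enc1[point:]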
def _get_current_step_records(self):
    """Load records of the previous step or models folder, keeping only those with weights."""
    step_name = General.step_name
    models_folder = PipeStepConfig.pipe_step.get("models_folder")
    cur_index = PipelineConfig.steps.index(step_name)
    if cur_index >= 1 or models_folder:
        if not models_folder:
            models_folder = FileOps.join_path(
                TaskOps().local_output_path, PipelineConfig.steps[cur_index - 1])
        models_folder = models_folder.replace("{local_base_path}", TaskOps().local_base_path)
        records = ReportServer().load_records_from_model_folder(models_folder)
    else:
        records = self._load_single_model_records()
    final_records = []
    for record in records:
        if not record.weights_file:
            logger.error("Model file does not exist, id={}".format(record.worker_id))
        else:
            record.step_name = General.step_name
            final_records.append(record)
    logging.debug("Records: {}".format(final_records))
    return final_records
def search(self):
    """Search one mutated model.

    :return: current number of samples, and the model
    """
    desc = deepcopy(self.search_space)
    search_desc = desc.custom
    # TODO: merge SR EA into one pipe step.
    records = ReportServer().get_pareto_front_records(['random', 'mutate'])
    codes = []
    for record in records:
        codes.append(record.desc['custom']['code'])
    code_to_mutate = random.choice(codes)
    current_mutate, code_mutated = 0, code_to_mutate
    num_candidates = len(search_desc["candidates"])
    # apply num_mutate effective mutations; no-op mutations are not counted
    while current_mutate < self.num_mutate:
        code_new = self.mutate_once(code_mutated, num_candidates)
        if code_new != code_mutated:
            current_mutate += 1
            code_mutated = code_new
    logging.info("Mutate from {} to {}".format(code_to_mutate, code_mutated))
    search_desc['code'] = code_mutated
    search_desc['method'] = "mutate"
    search_desc = self.codec.decode(search_desc)
    desc['custom'] = search_desc
    self.sample_count += 1
    return dict(worker_id=self.sample_count, encoded_desc=desc)
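# mutate_once is defined elsewhere; a plausible single-point mutation over
# num_candidates choices might look like the hypothetical sketch below
# (illustrative only, not the repo's implementation). Note it may return the
# input unchanged, which is why the loop above only counts iterations where
# the code actually changed.
import random

def mutate_once_example(code, num_candidates):
    """Replace one random position of `code` with a random candidate index."""
    index = random.randrange(len(code))
    return code[:index] + str(random.randrange(num_candidates)) + code[index + 1:]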
def do(self):
    """Do the main task in this pipe step."""
    logging.debug("SearchPipeStep started...")
    while not self.generator.is_completed:
        res = self.generator.sample()
        if res:
            self._dispatch_trainer(res)
        else:
            time.sleep(0.2)
    self.master.join()
    logging.debug("Pareto_front values: %s", ReportServer().pareto_front(General.step_name))
    ReportServer().output_pareto_front(General.step_name)
    self.master.close_client()
    if General.clean_worker_dir:
        self._clean_checkpoint()
def get_pareto_front(self):
    """Get the Pareto front of trained candidates and pick one code to mutate."""
    records = ReportServer().get_pareto_front_records()
    codes = []
    for record in records:
        codes.append(record.desc['code'])
    code_to_mutate = random.choice(codes)
    return code_to_mutate
def update(self, step_name, worker_id):
    """Update the search algorithm according to the worker's record.

    :param step_name: step name
    :param worker_id: current worker id
    :return:
    """
    record = ReportClient.get_record(step_name, worker_id)
    logging.debug("Get Record=%s", str(record))
    self.search_alg.update(record.serialize())
    self.dump()
    if not hasattr(self.search_alg, '_remove_watched_var') or self.search_alg._remove_watched_var:
        ReportServer.remove_watched_var(step_name, worker_id)
    logging.info("Update Success. step_name=%s, worker_id=%s, desc=%s",
                 step_name, worker_id, record.desc)
    logging.info("Best values: %s", ReportServer().print_best(step_name=General.step_name))
def do(self):
    """Start to run benchmark evaluator."""
    logger.info("BenchmarkPipeStep started...")
    records = self._get_current_step_records()
    if not records:
        logger.error("There is no model to evaluate.")
        return
    self.master = create_master()
    for record in records:
        _record = ReportRecord(worker_id=record.worker_id, desc=record.desc,
                               step_name=record.step_name)
        ReportClient().broadcast(_record)
        ReportServer().add_watched_var(record.step_name, record.worker_id)
        self._evaluate_single_model(record)
    self.master.join()
    ReportServer().output_step_all_records(step_name=General.step_name)
    self.master.close_client()
    ReportServer().backup_output_path()
def search(self):
    """Search a sample."""
    pareto_records = ReportServer().get_pareto_front_records(choice='normal')
    best_record = pareto_records[0] if pareto_records else None
    desc = self.search_space.sample()
    if best_record:
        desc['network.neck.code'] = self._mutate_parallelnet(
            best_record.desc.get("neck").get('code'))
    self.sample_count += 1
    logging.info("desc:{}".format(desc))
    return dict(worker_id=self.sample_count, encoded_desc=desc)
def run(self):
    """Execute the whole pipeline."""
    def _shutdown_cluster(signum, frame):
        logging.info("Shutdown urgently.")
        shutdown_cluster()
        os._exit(0)

    steps_time = []
    error_occurred = False
    try:
        signal.signal(signal.SIGINT, _shutdown_cluster)
        signal.signal(signal.SIGTERM, _shutdown_cluster)
        _ = ReportServer()
        for step_name in PipelineConfig.steps:
            step_cfg = UserConfig().data.get(step_name)
            General.step_name = step_name
            PipeStepConfig.renew()
            PipeStepConfig.from_dict(step_cfg, skip_check=False)
            self._set_evaluator_config(step_cfg)
            logging.info("-" * 48)
            logging.info(" Step: {}".format(step_name))
            logging.info("-" * 48)
            logger.debug("Pipe step config: {}".format(PipeStepConfig()))
            if PipeStepConfig.type == "SearchPipeStep":
                General._parallel = General.parallel_search
            if PipeStepConfig.type == "TrainPipeStep":
                General._parallel = General.parallel_fully_train
            start_time = datetime.datetime.now()
            PipeStep().do()
            end_time = datetime.datetime.now()
            steps_time.append([step_name, start_time, end_time,
                               self._interval_time(start_time, end_time)])
    except Exception:
        logger.error("Failed to run pipeline.")
        logger.error(traceback.format_exc())
        error_occurred = True
    try:
        shutdown_cluster()
    except Exception:
        logger.error("Failed to shutdown dask cluster.")
        logger.error(traceback.format_exc())
    if not error_occurred:
        self._show_pipeline_info(steps_time, step_name)
def search(self):
    """Search one NetworkDesc from search space.

    :return: search id, network desc
    :rtype: int, NetworkDesc
    """
    if self.sample_count < self.random_models:
        self.sample_count += 1
        return dict(worker_id=self.sample_count, encoded_desc=self.search_space.sample())
    records = ReportServer().get_pareto_front_records(self.step_name, self.num_individual)
    if not records:
        return None
    codes = []
    each_codes_cache = {}
    # Merge codes
    for record in records:
        each_code = []
        for key, item in record.desc.get('props').items():
            each_codes_cache[key] = len(item)
            each_code.extend(item)
        codes.append(each_code)
    self.length = len(codes[0])
    logging.info("codes sum={}, code length={}".format(sum(codes[0]), self.length))
    if len(codes) < 2:
        encoding1, encoding2 = codes[0], codes[0]
    else:
        encoding1, encoding2 = random.sample(codes, 2)
    choice = random.randint(0, 1)
    # mutate
    if choice == 0:
        encoding_new = self.mutatation(encoding1)
    # crossover
    else:
        encoding_new, _ = self.crossover(encoding1, encoding2)
    # split codes
    desc = {}
    for _name, _size in each_codes_cache.items():
        desc["network.props.{}".format(_name)] = encoding_new[:_size]
        encoding_new = encoding_new[_size:]
    self.sample_count += 1
    return dict(worker_id=self.sample_count, encoded_desc=desc)
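# A round-trip demo of the merge/split scheme above, with made-up property
# names and codes: per-property lists are concatenated into one flat encoding
# for the EA operators, then cut back into slices of the recorded lengths
# (dict insertion order is preserved in Python 3.7+, so the cuts line up).
props = {'doublechannel': [0, 1, 0], 'downsample': [1, 0]}
each_codes_cache = {key: len(item) for key, item in props.items()}
flat = [bit for item in props.values() for bit in item]   # [0, 1, 0, 1, 0]
desc = {}
for _name, _size in each_codes_cache.items():
    desc["network.props.{}".format(_name)] = flat[:_size]
    flat = flat[_size:]
assert desc == {'network.props.doublechannel': [0, 1, 0],
                'network.props.downsample': [1, 0]}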
def search(self):
    """Search code of a model."""
    desc = deepcopy(self.search_space)
    search_desc = self.search_space.custom
    if self.sample_count < self.max_sample_random:
        encoding = self.random.search()
    else:
        records = ReportServer().get_pareto_front_records(['nas'])
        if len(records) == 0:
            encoding = self.random.search()
            logging.info("Pareto front records are empty, fall back to random search.")
        else:
            encoding = self.mutate.search()
    search_desc['encoding'] = deepcopy(encoding)
    encoding[0] = self.codec.decode(encoding[0])
    search_desc['config'] = encoding
    self.sample_count += 1
    desc['custom'] = search_desc
    return self.sample_count, desc
def _get_current_step_records(self):
    """Load records from the previous step's output folder, or create a default record."""
    step_name = self.task.step_name
    models_folder = PipeStepConfig.pipe_step.get("models_folder")
    cur_index = PipelineConfig.steps.index(step_name)
    if cur_index >= 1 or models_folder:
        # records = ReportServer().get_pareto_front_records(PipelineConfig.steps[cur_index - 1])
        if not models_folder:
            models_folder = FileOps.join_path(
                TaskOps().local_output_path, PipelineConfig.steps[cur_index - 1])
        models_folder = models_folder.replace("{local_base_path}", TaskOps().local_base_path)
        records = ReportServer().load_records_from_model_folder(models_folder)
    else:
        records = [ReportRecord(step_name, 0)]
    logging.debug("Records: {}".format(records))
    for record in records:
        record.step_name = step_name
    return records
def __init__(self, update_func=None):
    """Init master attributes, set up and start the dask distributed cluster and local multiprocess pool."""
    self.cfg = General()
    self.task_count = 0
    self.eval_count = General.worker.eval_count
    self.dask_env = DaskEnv(General.env,
                            self.__master_path__,
                            General.devices_per_trainer,
                            TaskOps().temp_path)
    status = self.dask_env.start()
    if not status or not self.dask_env.is_master:
        sys.exit(0)
    self._start_cluster()
    self.t_queue = Queue()
    self.update_func = update_func
    self.evaluator_list = {}
    self._thread_running = True
    self._lock = Lock()
    self._thread = self._run_monitor_thread()
    ReportServer().renew()
def __init__(self, search_space=None, **kwargs):
    """Init DnetNas."""
    super(DnetNas, self).__init__(search_space, **kwargs)
    # ea or random
    self.num_mutate = self.config.policy.num_mutate
    self.random_ratio = self.config.policy.random_ratio
    self.max_sample = self.config.range.max_sample
    self.min_sample = self.config.range.min_sample
    self.sample_count = 0
    logging.info("initialized DnetNas")
    self.pareto_front = ParetoFront(self.config.pareto.object_count,
                                    self.config.pareto.max_object_ids)
    self._best_desc_file = 'nas_model_desc.json'
    block_nas_folder = ModelConfig.models_folder.format(
        local_base_path=self.local_base_path)
    logging.info(f'folder: {block_nas_folder}')
    base_reports = ReportServer().load_records_from_model_folder(block_nas_folder)
    logging.info(f'base_reports: {base_reports}')
    self.base_block = base_reports[0].desc['backbone']['encoding'].split('_')[0]