def _game(eva, net_pool, ds, round):
    """Run a single game round over the networks in ``net_pool``.

    From the second round on, a new ``round_info`` record is cloned from the
    block's first record and ``_sample`` is called before evaluation. Every
    round then builds the task list, evaluates it and hands the pool to
    ``_record_result``, ``_eliminate`` and ``_update``. Start time, data size,
    epoch and cost of the round are written to ``Stage_Info``.

    :param eva: evaluator forwarded to ``_epoch_ctrl`` and ``_eva_net``
    :param net_pool: candidate networks; the block index is taken from
        ``len(net_pool[0].pre_block)``
    :param ds: object whose ``control(stage="game")`` yields the data size
    :param round: round number; values above 1 trigger the extra record
        and the sampling step
    :return: None
    """
    timer = TimeCnt()
    round_began = timer.start()
    block_id = len(net_pool[0].pre_block)
    NAS_LOG << ('nas_round_start', block_id + 1, round, round_began)

    cur_data_size = ds.control(stage="game")
    cur_epoch = _epoch_ctrl(eva, stage="game")

    later_round = round > 1

    # Bookkeeping for this round; nothing else runs between these writes,
    # so the block entry can be looked up once.
    blk_stage = Stage_Info['blk_info'][block_id]
    round_records = blk_stage['round_info']
    if later_round:
        # The first record doubles as the template for every later round.
        round_records.append(copy.deepcopy(round_records[0]))
    this_round = round_records[-1]
    this_round['round_start'] = round_began
    this_round['round_data_size'] = cur_data_size
    blk_stage['search_epoch'] = cur_epoch

    if later_round:
        _sample(net_pool)
    tasks = _spl_info_to_tasks(net_pool, round, cur_epoch, cur_data_size)
    _record_result(net_pool, _eva_net(tasks, eva))
    _eliminate(net_pool, round)
    _update(net_pool)

    round_cost = timer.stop()
    NAS_LOG << ('nas_round_over', round_cost)
    # Looked up afresh because the helpers above ran in between.
    Stage_Info['blk_info'][block_id]['round_info'][-1]['round_cost'] = \
        round_cost
def _subproc_eva(task_item, result_buffer, signal, eva):
    """Evaluate one task item and fill in its pid, timing and score.

    With ``MAIN_CONFIG['eva_mask']`` set, the score is a random value in
    ``[0, 0.1]`` and the evaluator is not called. Otherwise
    ``CUDA_VISIBLE_DEVICES`` is set from ``task_item.gpu_info`` and
    ``eva.evaluate`` is called; any ``Exception`` it raises is logged through
    ``_err_log`` and the score falls back to 0.

    :param task_item: task to evaluate; mutated in place
    :param result_buffer: object with ``put``; receives the finished task
        when both it and ``signal`` are truthy
    :param signal: object with ``set``; set after the result is queued
    :param eva: evaluator providing ``evaluate(task_item)``
    :return: the same ``task_item``, updated
    """
    sub_log = Logger()
    task_item.pid = os.getpid()
    timer = TimeCnt()
    task_item.start_time = timer.start()
    sub_log << (
        'nas_eva_ing',
        len(task_item.pre_block) + 1,
        task_item.round,
        task_item.nn_id,
        task_item.network_item.id,
    )

    if not MAIN_CONFIG['eva_mask']:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(task_item.gpu_info)
        try:
            task_item.score = eva.evaluate(task_item)
        except Exception as error:
            # Best effort: a failed evaluation is logged and scored as 0
            # instead of aborting the caller.
            _err_log(sub_log, task_item, error)
            task_item.score = 0
    else:
        task_item.score = random.uniform(0, 0.1)

    task_item.cost_time = timer.stop()
    sub_log << (
        'nas_eva_fin',
        len(task_item.pre_block) + 1,
        task_item.round,
        task_item.nn_id,
        task_item.network_item.id,
        task_item.score,
        task_item.cost_time,
        task_item.pid,
    )

    # Without both hand-over objects the task is only returned.
    if not (result_buffer and signal):
        return task_item

    # use in subprocess
    result_buffer.put(task_item)
    signal.set()
    return task_item
def run(self):
    """Search every block in turn, dump the stage info and retrain.

    The pool returned by ``self.enu.enumerate()`` is passed to
    ``_search_blk`` once per block (``MAIN_CONFIG["block_num"]`` times) and
    each result is appended to ``Network.pre_block``.

    :return: ``Network.pre_block`` after all blocks have been appended
    """
    NAS_LOG << 'nas_enuming'
    pool_template = self.enu.enumerate()

    search_timer = TimeCnt()
    search_began = search_timer.start()
    NAS_LOG << ('nas_start_search', search_began)
    Stage_Info['nas_start'] = search_began

    for blk_idx in range(MAIN_CONFIG["block_num"]):
        searched_blk = _search_blk(blk_idx, self.eva, self.ds, pool_template)
        Network.pre_block.append(searched_blk)

    search_cost = search_timer.stop()
    NAS_LOG << ('nas_search_end', search_cost)
    Stage_Info['nas_cost'] = search_cost

    # NOTE(review): Stage_Info is dumped before _retrain writes its
    # 'retrain_*' keys -- confirm whether they are persisted elsewhere.
    _dump_stage(Stage_Info)
    _retrain(self.eva, self.ds)

    for searched in Network.pre_block:
        NAS_LOG << ('nas_pre_block', str(searched.graph),
                    str(searched.cell_list))
    return Network.pre_block
def _retrain(eva, ds):
    """Run one retrain task over ``Network.pre_block`` and record its stats.

    A single ``EvaScheduleItem`` is built, loaded into ``TSche`` and executed
    via ``_subproc_eva``. The score of the first result is logged, and start
    time, cost, epoch and data size are stored in ``Stage_Info``.

    :param eva: evaluator handed to ``TSche.exec_task``
    :param ds: object whose ``control(stage="retrain")`` yields the data size
    :return: None
    """
    timer = TimeCnt()
    retrain_began = timer.start()
    NAS_LOG << ('nas_retrain', retrain_began)

    # NOTE(review): the other call sites in this file pass `eva` to
    # _epoch_ctrl; here it is omitted -- confirm this is intended.
    cur_epoch = _epoch_ctrl(stage="retrain")
    cur_data_size = ds.control(stage="retrain")

    retrain_task = EvaScheduleItem(
        nn_id=-1,
        alig_id=-1,
        graph_template=[],
        item=None,
        pre_blk=Network.pre_block,
        ft_sign=True,
        bestNN=True,
        rd=0,
        nn_left=-1,
        spl_batch_num=-1,
        epoch=cur_epoch,
        data_size=cur_data_size,
    )
    TSche.load_tasks([retrain_task])
    TSche.exec_task(_subproc_eva, eva)
    finished = TSche.get_result()
    retrain_score = finished[0].score

    retrain_cost = timer.stop()
    NAS_LOG << ('nas_retrain_end', retrain_cost, retrain_score)

    for key, value in (
            ('retrain_start', retrain_began),
            ('retrain_cost', retrain_cost),
            ('retrain_epoch', cur_epoch),
            ('retrain_data_size', cur_data_size),
    ):
        Stage_Info[key] = value
def _err_log(NAS_LOG, task_item, error):
    """Write three log records describing a task whose evaluation failed.

    The records are ``'err_task_info'`` (task identity and preceding blocks),
    ``'err_scheme_info'`` (scheduling fields and the scheme under test) and
    ``'err_info'`` (the error itself).

    :param NAS_LOG: logger accepting records through ``<<``
    :param task_item: the failed task; read only
    :param error: the error object to log
    :return: None
    """
    pre_block = [
        (blk.graph, blk.cell_list, blk.code) for blk in task_item.pre_block
    ]
    NAS_LOG << (
        'err_task_info',
        TimeCnt().start(),
        len(pre_block) + 1,
        task_item.nn_id,
        task_item.alig_id,
        str(pre_block),
        str(task_item.graph_template),
    )

    scheme = task_item.network_item
    NAS_LOG << (
        'err_scheme_info',
        task_item.task_id,
        task_item.pid,
        task_item.start_time,
        task_item.cost_time,
        task_item.gpu_info,
        task_item.round,
        task_item.nn_left,
        task_item.spl_batch_num,
        str(scheme.graph),
        str(scheme.cell_list),
        str(scheme.code),
        scheme.score,
    )

    NAS_LOG << ('err_info', error)
def _confirm_train(eva, best_nn, best_index, ds):
    """Re-evaluate the chosen scheme of ``best_nn`` in the confirm stage.

    A new ``NetworkItem`` is built from ``best_nn.item_list[best_index]``,
    evaluated once (in-process when ``MAIN_CONFIG['subp_eva_debug']`` is set,
    otherwise through ``TSche``), given the resulting score and task info,
    and appended to ``best_nn.item_list``.

    :param eva: evaluator forwarded to ``_epoch_ctrl`` and the evaluation
    :param best_nn: network whose scheme is confirmed; its ``item_list``
        gains one entry
    :param best_index: index into ``best_nn.item_list`` of the chosen scheme
    :param ds: object whose ``control(stage="confirm")`` yields the data size
    :return: the newly created and scored ``NetworkItem``
    """
    timer = TimeCnt()
    confirm_began = timer.start()
    pre_blk = best_nn.pre_block
    blk_id = len(pre_blk)
    NAS_LOG << ("nas_confirm_train", blk_id + 1, confirm_began)

    cur_data_size = ds.control(stage="confirm")
    cur_epoch = _epoch_ctrl(eva, stage="confirm")

    blk_stage = Stage_Info['blk_info'][blk_id]
    blk_stage['confirm_train_start'] = confirm_began
    blk_stage['confirm_epoch'] = cur_epoch
    blk_stage['confirm_data_size'] = cur_data_size

    chosen = best_nn.item_list[best_index]
    network_item = NetworkItem(
        len(best_nn.item_list), chosen.graph, chosen.cell_list, chosen.code)
    confirm_task = EvaScheduleItem(
        best_nn.id, 0, best_nn.graph_template, network_item, pre_blk,
        ft_sign=True, bestNN=True, rd=-1, nn_left=0, spl_batch_num=1,
        epoch=cur_epoch, data_size=cur_data_size)
    task_list = [confirm_task]

    if not MAIN_CONFIG['subp_eva_debug']:
        TSche.load_tasks(task_list)
        TSche.exec_task(_subproc_eva, eva)
        result = TSche.get_result()
    else:
        # Debug mode: evaluate in this process, no buffer or signal needed.
        result = [_subproc_eva(task, None, None, eva) for task in task_list]

    first_result = result[0]
    network_item.score = first_result.score
    network_item.task_info = first_result
    best_nn.item_list.append(network_item)

    confirm_cost = timer.stop()
    NAS_LOG << ("nas_confirm_train_fin", confirm_cost)
    # NOTE(review): the key is spelled 'confirm_trian_cost'; kept as is
    # because consumers of Stage_Info may rely on it -- confirm.
    Stage_Info['blk_info'][blk_id]['confirm_trian_cost'] = confirm_cost
    return network_item
def _search_blk(block_id, eva, ds, npool_tem):
    """Search one block: play game rounds until one network is left.

    A deep copy of ``npool_tem`` is initialised with
    ``_init_npool_sampler`` and ``_init_ops``; ``_game`` is then called with
    an increasing round number while the pool holds more than one network.
    The remaining pool is finally handed to ``_train_winner``.

    :param block_id: zero-based index of the block being searched
    :param eva: evaluator forwarded to ``_game`` and ``_train_winner``
    :param ds: data-size controller forwarded to ``_game`` and
        ``_train_winner``
    :param npool_tem: network pool template; copied, never modified here
    :return: the value returned by ``_train_winner``
    """
    blk_timer = TimeCnt()
    blk_began = blk_timer.start()
    NAS_LOG << ('nas_search_blk', block_id + 1, MAIN_CONFIG["block_num"],
                blk_began)
    Stage_Info['blk_info'][block_id]['blk_start'] = blk_began

    net_pool = copy.deepcopy(npool_tem)
    _init_npool_sampler(net_pool, block_id)
    net_pool = _init_ops(net_pool)

    game_timer = TimeCnt()
    game_began = game_timer.start()
    NAS_LOG << ('nas_rounds_game_start', block_id + 1, game_began)
    Stage_Info['blk_info'][block_id]['rounds_game_start'] = game_began

    rounds_played = 0
    while len(net_pool) > 1:
        rounds_played += 1
        _game(eva, net_pool, ds, rounds_played)

    game_cost = game_timer.stop()
    NAS_LOG << ('nas_get_winner', game_cost)
    blk_stage = Stage_Info['blk_info'][block_id]
    blk_stage['rounds_game_cost'] = game_cost
    blk_stage['round_num'] = rounds_played

    # The winner stage is numbered as the round after the last game round.
    network_item = _train_winner(eva, net_pool, ds, rounds_played + 1)

    blk_cost = blk_timer.stop()
    NAS_LOG << ('nas_search_blk_end', blk_cost)
    Stage_Info['blk_info'][block_id]['blk_cost'] = blk_cost
    return network_item
def _train_winner(eva, net_pl, ds, round, spl_num=MAIN_CONFIG['num_opt_best']):
    """Sample and evaluate schemes for the winner, then confirm the best.

    A first batch of ``MAIN_CONFIG['num_gpu']`` samples is evaluated. While
    fewer than ``spl_num`` samples have been counted, each newly recorded
    result drives an ``_update`` and one more sample per affected entry.
    The first network is then popped from ``net_pl``, the best score among
    its last ``spl_num`` items is located, and that scheme goes through
    ``_confirm_train``.

    Args:
        eva: evaluator forwarded to ``_epoch_ctrl``, ``_eva_net`` and
            ``_confirm_train``
        net_pl: network pool; its first element is popped as the winner
        ds: data-size controller, queried with ``stage="game"``
        round: round number passed on to task creation and
            ``_save_net_info``
        spl_num: number of samples to count before stopping
    Returns:
        the ``NetworkItem`` returned by ``_confirm_train``
    """
    timer = TimeCnt()
    winner_began = timer.start()
    blk_id = len(net_pl[0].pre_block)
    NAS_LOG << ("nas_train_winner_start", blk_id + 1, round, winner_began)

    cur_data_size = ds.control(stage="game")
    cur_epoch = _epoch_ctrl(eva, stage="game")
    blk_stage = Stage_Info["blk_info"][blk_id]
    blk_stage["train_winner_start"] = winner_began
    blk_stage["train_winner_data_size"] = cur_data_size

    sampled = 0
    if sampled < spl_num:
        # First batch: one sample per GPU.
        batch_num = MAIN_CONFIG['num_gpu']
        _sample(net_pl, batch_num=batch_num)
        task_list = _spl_info_to_tasks(net_pl, round, cur_epoch,
                                       cur_data_size, batch_num=batch_num)
        result = _eva_net(task_list, eva, async_exec=True)
        newly_added_id = _record_result(net_pl, result)
        sampled += batch_num

        while sampled < spl_num:
            # Count the fresh results, capped so the total never exceeds
            # spl_num.
            newly_num = min(len(newly_added_id), spl_num - sampled)
            _update(net_pl, newly_added_id=newly_added_id)
            base_alig_id = [added[0] for added in newly_added_id]
            base_item_id = _sample(net_pl, batch_num=1,
                                   base_alig_id=base_alig_id)
            # The last batch is evaluated with async_exec=False.
            more_to_come = sampled + newly_num < spl_num
            task_list = _spl_info_to_tasks(net_pl, round, cur_epoch,
                                           cur_data_size,
                                           base_item_id=base_item_id)
            result = _eva_net(task_list, eva, async_exec=more_to_come)
            newly_added_id = _record_result(net_pl, result)
            sampled += newly_num

    best_nn = net_pl.pop(0)
    scores = [item.score for item in best_nn.item_list[-spl_num:]]
    # Negative index, i.e. counted from the end of item_list.
    best_index = scores.index(max(scores)) - len(scores)
    network_item = _confirm_train(eva, best_nn, best_index, ds)
    _save_net_info(best_nn, round, len(net_pl))

    winner_cost = timer.stop()
    NAS_LOG << ("nas_train_winner_tem", winner_cost)
    Stage_Info["blk_info"][blk_id]["train_winner_cost"] = winner_cost
    return network_item