def _subp_confirm_train(eva, network_item, pre_blk, gpuq):
    """Evaluate one network in the "confirm" stage on a GPU borrowed from ``gpuq``.

    Args:
        eva: evaluator object; its ``evaluate`` method is called with
            ``is_bestNN=True`` and ``update_pre_weight=True``.
        network_item: the network item handed to ``eva.evaluate``.
        pre_blk: the preceding blocks handed to ``eva.evaluate``.
        gpuq: queue of idle GPU ids. One id is taken for the duration of
            the call and is always put back, even when evaluation fails.

    Returns:
        Whatever ``eva.evaluate`` returns (used as the score by callers).

    Raises:
        Any exception raised by ``_epoch_ctrl`` or ``eva.evaluate`` is
        propagated after the GPU id has been returned to ``gpuq``.
    """
    ngpu = gpuq.get()
    try:
        # Restrict this process to the GPU that was just reserved.
        os.environ['CUDA_VISIBLE_DEVICES'] = str(ngpu)
        _epoch_ctrl(eva, stage="confirm")
        return eva.evaluate(network_item, pre_blk,
                            is_bestNN=True, update_pre_weight=True)
    finally:
        # Hand the GPU back unconditionally; otherwise a failed evaluation
        # would permanently remove it from the idle pool.
        gpuq.put(ngpu)
def _confirm_train(eva, com, best_nn, best_index, ds, process_pl):
    """Re-evaluate the selected item of ``best_nn`` in the "confirm" stage.

    A new ``NetworkItem`` is built from the item at ``best_index``, evaluated
    through ``process_pl.apply`` using ``_subp_confirm_train``, given the
    resulting score, and appended to ``best_nn.item_list``.

    Args:
        eva: evaluator forwarded to ``_epoch_ctrl`` and to the worker.
        com: object providing ``idle_gpuq``, forwarded to the worker.
        best_nn: network unit whose ``item_list`` is read and extended.
        best_index: index into ``best_nn.item_list`` of the item to confirm.
        ds: data-size controller; ``ds.control(stage="confirm")`` is called.
        process_pl: pool-like object exposing ``apply``.

    Returns:
        The newly created and scored ``NetworkItem``.
    """
    NAS_LOG << "confirm_train"

    winner = best_nn.item_list[best_index]
    new_item_id = len(best_nn.item_list) + 1
    network_item = NetworkItem(
        new_item_id, winner.graph, winner.cell_list, winner.code)

    ds.control(stage="confirm")
    _epoch_ctrl(eva, stage="confirm")

    network_item.score = process_pl.apply(
        _subp_confirm_train,
        (eva, network_item, Network.pre_block, com.idle_gpuq),
    )
    best_nn.item_list.append(network_item)
    return network_item
def _game(eva, net_pool, ds, round):
    """Play one search round over ``net_pool`` and record its bookkeeping.

    The round's start time, data size and epoch are written into
    ``Stage_Info``; the networks are then sampled (rounds after the first
    only), evaluated, recorded, eliminated and updated, and finally the
    round's cost is stored.

    Args:
        eva: evaluator forwarded to ``_epoch_ctrl`` and ``_eva_net``.
        net_pool: list of network units; ``net_pool[0].pre_block`` determines
            the block index used for ``Stage_Info``.
        ds: data-size controller; ``ds.control(stage="game")`` is called.
        round: round number; rounds greater than 1 get a fresh ``round_info``
            entry (deep-copied from the first entry) and an extra ``_sample``.
    """
    timer = TimeCnt()
    round_started = timer.start()
    block_id = len(net_pool[0].pre_block)
    NAS_LOG << ('nas_round_start', block_id + 1, round, round_started)

    data_size = ds.control(stage="game")
    epoch = _epoch_ctrl(eva, stage="game")

    later_round = round > 1
    blk_record = Stage_Info['blk_info'][block_id]
    round_records = blk_record['round_info']
    if later_round:
        # Rounds after the first reuse the first round's record as a template.
        round_records.append(copy.deepcopy(round_records[0]))
    current_record = round_records[-1]
    current_record['round_start'] = round_started
    current_record['round_data_size'] = data_size
    blk_record['search_epoch'] = epoch

    if later_round:
        _sample(net_pool)
    tasks = _spl_info_to_tasks(net_pool, round, epoch, data_size)
    outcome = _eva_net(tasks, eva)
    _record_result(net_pool, outcome)
    _eliminate(net_pool, round)
    _update(net_pool)

    round_cost = timer.stop()
    NAS_LOG << ('nas_round_over', round_cost)
    Stage_Info['blk_info'][block_id]['round_info'][-1]['round_cost'] = round_cost
def _train_winner(eva, net_pl, com, ds, pro_pl, round):
    """Train the remaining network further and pick its best item.

    Args:
        eva: evaluator forwarded to the training helpers.
        net_pl: list of network units; ``net_pl[0]`` is treated as the winner.
        com: communication object; in the "Block" pattern its ``net_pool``
            and ``tw_count`` attributes are set before tasks are run.
        ds: data-size controller; ``ds.control(stage="game")`` is called.
        pro_pl: process pool forwarded to the task helpers.
        round: the round number of the game.

    Returns:
        The chosen network item: in the "Block" pattern the item produced by
        ``_confirm_train``; otherwise the best-scoring item among the last
        ``MAIN_CONFIG['num_opt_best']`` entries of the winner's ``item_list``.
    """
    NAS_LOG << "config_ops_ing"
    started_at = time.time()
    ds.control(stage="game")
    _epoch_ctrl(eva, stage="game")

    pattern = MAIN_CONFIG['pattern']
    block_pattern = pattern == "Block"
    if block_pattern:
        _assign_task(net_pl, com, round,
                     batch_num=MAIN_CONFIG['num_gpu'], block_winner=True)
        com.net_pool = net_pl
        com.tw_count = (NAS_CONFIG['nas_main']['num_opt_best']
                        - NAS_CONFIG['nas_main']['num_gpu'])
        _do_task(pro_pl, com, eva)
    elif pattern == "Global":
        _global_train(net_pl, com, pro_pl, eva)

    best_nn = net_pl[0]
    _save_net_info(best_nn, len(Network.pre_block) + 1, round, len(net_pl),
                   best_nn.id, len(best_nn.item_list))

    recent_items = best_nn.item_list[-MAIN_CONFIG['num_opt_best']:]
    scores = [candidate.score for candidate in recent_items]
    # Negative offset, so it indexes item_list from its end.
    best_index = scores.index(max(scores)) - len(scores)

    if block_pattern:
        network_item = _confirm_train(eva, com, best_nn, best_index, ds, pro_pl)
        _rm_other_model(network_item)
    else:
        network_item = best_nn.item_list[best_index]

    NAS_LOG << ("train_winner_tem", time.time() - started_at)
    return network_item
def _retrain(eva, ds):
    """Run the "retrain" stage as a single scheduled task and log its timing.

    One ``EvaScheduleItem`` is built for ``Network.pre_block``, executed via
    ``TSche``, and the start time, cost, epoch and data size are stored in
    ``Stage_Info``. Nothing is returned; the score is only logged.

    Args:
        eva: evaluator forwarded to ``TSche.exec_task``.
        ds: data-size controller; ``ds.control(stage="retrain")`` is called.
    """
    timer = TimeCnt()
    started = timer.start()
    NAS_LOG << ('nas_retrain', started)

    # NOTE(review): every other _epoch_ctrl call in this file passes `eva`
    # as the first argument; this one does not -- confirm that is intended.
    epoch = _epoch_ctrl(stage="retrain")
    data_size = ds.control(stage="retrain")

    retrain_task = EvaScheduleItem(
        nn_id=-1, alig_id=-1, graph_template=[], item=None,
        pre_blk=Network.pre_block, ft_sign=True, bestNN=True, rd=0,
        nn_left=-1, spl_batch_num=-1, epoch=epoch, data_size=data_size)
    TSche.load_tasks([retrain_task])
    TSche.exec_task(_subproc_eva, eva)
    finished = TSche.get_result()
    retrain_score = finished[0].score

    elapsed = timer.stop()
    NAS_LOG << ('nas_retrain_end', elapsed, retrain_score)

    for key, value in (
        ('retrain_start', started),
        ('retrain_cost', elapsed),
        ('retrain_epoch', epoch),
        ('retrain_data_size', data_size),
    ):
        Stage_Info[key] = value
def _confirm_train(eva, best_nn, best_index, ds):
    """Re-evaluate the selected item of ``best_nn`` in the "confirm" stage.

    A copy of the item at ``best_index`` is wrapped in a single
    ``EvaScheduleItem`` and evaluated, either in-process (when
    ``MAIN_CONFIG['subp_eva_debug']`` is truthy) or through ``TSche``.
    The scored item is appended to ``best_nn.item_list`` and timing
    details are stored in ``Stage_Info``.

    Args:
        eva: evaluator forwarded to ``_epoch_ctrl`` and the evaluation call.
        best_nn: network unit providing ``pre_block``, ``id``,
            ``graph_template`` and ``item_list``.
        best_index: index into ``best_nn.item_list`` of the item to confirm.
        ds: data-size controller; ``ds.control(stage="confirm")`` is called.

    Returns:
        The newly created ``NetworkItem`` with ``score`` and ``task_info`` set.
    """
    timer = TimeCnt()
    started = timer.start()
    pre_blk = best_nn.pre_block
    blk_id = len(pre_blk)
    NAS_LOG << ("nas_confirm_train", blk_id + 1, started)

    data_size = ds.control(stage="confirm")
    epoch = _epoch_ctrl(eva, stage="confirm")

    blk_record = Stage_Info['blk_info'][blk_id]
    blk_record['confirm_train_start'] = started
    blk_record['confirm_epoch'] = epoch
    blk_record['confirm_data_size'] = data_size

    winner = best_nn.item_list[best_index]
    network_item = NetworkItem(
        len(best_nn.item_list), winner.graph, winner.cell_list, winner.code)
    confirm_task = EvaScheduleItem(
        best_nn.id, 0, best_nn.graph_template, network_item, pre_blk,
        ft_sign=True, bestNN=True, rd=-1, nn_left=0, spl_batch_num=1,
        epoch=epoch, data_size=data_size)
    task_list = [confirm_task]

    if MAIN_CONFIG['subp_eva_debug']:
        # Debug path: evaluate in this process instead of via the scheduler.
        result = [_subproc_eva(task, None, None, eva) for task in task_list]
    else:
        TSche.load_tasks(task_list)
        TSche.exec_task(_subproc_eva, eva)
        result = TSche.get_result()

    finished = result[0]
    network_item.score = finished.score
    network_item.task_info = finished
    best_nn.item_list.append(network_item)

    elapsed = timer.stop()
    NAS_LOG << ("nas_confirm_train_fin", elapsed)
    # The key spelling ('trian') is kept as-is: it is a runtime key that
    # other code may read.
    Stage_Info['blk_info'][blk_id]['confirm_trian_cost'] = elapsed
    return network_item
def _retrain(eva, com, process_pool):
    """Run the "retrain" stage through ``process_pool`` and return its score.

    Args:
        eva: evaluator forwarded to ``_epoch_ctrl`` and to the worker.
        com: object providing ``idle_gpuq``, forwarded to the worker.
        process_pool: pool-like object exposing ``apply``.

    Returns:
        The value returned by ``_subp_retrain`` via ``process_pool.apply``.
    """
    _epoch_ctrl(eva, stage="retrain")
    return process_pool.apply(
        _subp_retrain,
        (eva, Network.pre_block, com.idle_gpuq),
    )
def _game(eva, net_pool, com, ds, round, process_pool):
    """Play one "game" round: assign tasks, run them, and collect results.

    Args:
        eva: evaluator forwarded to ``_epoch_ctrl`` and ``_do_task``.
        net_pool: pool of networks handed to the task helpers.
        com: communication object shared by the task helpers.
        ds: data-size controller; ``ds.control(stage="game")`` is called.
        round: the round number, forwarded to ``_assign_task``.
        process_pool: process pool forwarded to ``_do_task``.
    """
    stage = "game"

    # Queue up the work for this round.
    _assign_task(net_pool, com, round)

    # Switch data size and epoch settings to the game stage.
    ds.control(stage=stage)
    _epoch_ctrl(eva, stage=stage)

    # Execute the tasks, then fold the results back into the pool.
    _do_task(process_pool, com, eva)
    _arrange_result(com, net_pool)
def _train_winner(eva, net_pl, ds, round, spl_num=MAIN_CONFIG['num_opt_best']):
    """Keep sampling the leading network, then confirm its best item.

    A first batch of ``MAIN_CONFIG['num_gpu']`` samples is evaluated; after
    that, each newly recorded result triggers an update and one further
    sample until ``spl_num`` samples have been counted. The first network is
    then popped from ``net_pl``, its best item among the last ``spl_num``
    entries is confirmed via ``_confirm_train``, and timing details are
    stored in ``Stage_Info``.

    Args:
        eva: evaluator forwarded to the evaluation helpers.
        net_pl: list of network units; its first element is popped and
            treated as the winner.
        ds: data-size controller; ``ds.control(stage="game")`` is called.
        round: the round number of the game.
        spl_num: total number of samples to count before stopping.

    Returns:
        The network item returned by ``_confirm_train``.
    """
    timer = TimeCnt()
    started = timer.start()
    blk_id = len(net_pl[0].pre_block)
    NAS_LOG << ("nas_train_winner_start", blk_id + 1, round, started)

    data_size = ds.control(stage="game")
    epoch = _epoch_ctrl(eva, stage="game")
    blk_record = Stage_Info["blk_info"][blk_id]
    blk_record["train_winner_start"] = started
    blk_record["train_winner_data_size"] = data_size

    sampled = 0
    if sampled < spl_num:
        # Priming step: evaluate one full batch (one sample per GPU).
        batch_num = MAIN_CONFIG['num_gpu']
        _sample(net_pl, batch_num=batch_num)
        task_list = _spl_info_to_tasks(net_pl, round, epoch, data_size,
                                       batch_num=batch_num)
        result = _eva_net(task_list, eva, async_exec=True)
        newly_added_id = _record_result(net_pl, result)
        sampled += batch_num

    while sampled < spl_num:
        # Never count more results than are still needed.
        newly_num = min(len(newly_added_id), spl_num - sampled)
        _update(net_pl, newly_added_id=newly_added_id)
        base_alig_id = [entry[0] for entry in newly_added_id]
        base_item_id = _sample(net_pl, batch_num=1, base_alig_id=base_alig_id)
        # Stay asynchronous only while more samples remain after this step.
        more_to_come = sampled + newly_num < spl_num
        task_list = _spl_info_to_tasks(net_pl, round, epoch, data_size,
                                       base_item_id=base_item_id)
        result = _eva_net(task_list, eva, async_exec=more_to_come)
        newly_added_id = _record_result(net_pl, result)
        sampled += newly_num

    best_nn = net_pl.pop(0)
    scores = [candidate.score for candidate in best_nn.item_list[-spl_num:]]
    # Negative offset, so it indexes item_list from its end.
    best_index = scores.index(max(scores)) - len(scores)
    network_item = _confirm_train(eva, best_nn, best_index, ds)
    _save_net_info(best_nn, round, len(net_pl))

    elapsed = timer.stop()
    NAS_LOG << ("nas_train_winner_tem", elapsed)
    Stage_Info["blk_info"][blk_id]["train_winner_cost"] = elapsed
    return network_item