import copy
import multiprocessing
import os
from multiprocessing import Pool


def dis_global(enu, eva, num_gpu):
    manager = multiprocessing.Manager()
    gpu_list = manager.Queue()
    if os.path.exists('1.txt'):  # start from a clean log file
        os.remove('1.txt')
    f1 = open("1.txt", 'a')
    NETWORK_POOL = enu.enumerate()
    for network in NETWORK_POOL:
        network.spl = Sampler(network.graph_template, 0)
    Net_item = NetworkItem(0)
    Net = NETWORK_POOL[0]
    for gpu in range(num_gpu):  # not test first_samper
        gpu_list.put(gpu)
        # one initial sample per GPU so the first dispatch round has work
        Net_item.cell, Net_item.graph, Net_item.code = Net.spl.sample()
        Net.item_list.append(copy.deepcopy(Net_item))
    pool = Pool(processes=num_gpu)
    eva.add_data(300)
    results = []
    i = 0
    while i < 200:
        # dispatch one evaluation per idle GPU
        while not gpu_list.empty():
            ngpu = gpu_list.get()
            eva_result = pool.apply_async(
                run_global, args=(eva, Net.item_list[i], ngpu, gpu_list))
            results.append(eva_result)
            i = i + 1
        # collect the scores of the jobs just dispatched; the hard-coded
        # `i - 2` offset assumes two evaluations are in flight per round
        k = 0
        for result in results[i - 2:]:
            Net.item_list[i - 2 + k].score = result.get()
            print(i - 2 + k, Net.item_list[i - 2 + k].score,
                  Net.item_list[i - 2 + k].cell,
                  Net.item_list[i - 2 + k].graph, flush=True)
            # feed the (code, -score) pair back into the sampler's optimizer
            Net.spl.update_opt_model(Net.item_list[i - 2 + k].code,
                                     -Net.item_list[i - 2 + k].score)
            # sample the next candidate and queue it for evaluation
            Net_item.cell, Net_item.graph, Net_item.code = Net.spl.sample()
            Net.item_list.append(copy.deepcopy(Net_item))
            save_glolog(f1, Net, i - 2 + k)
            k = k + 1
    pool.close()
    # pool.join()
    f1.close()
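# Hedged sketch: `run_global` and `save_glolog` are referenced above but not
# defined in this section. Assuming `run_global` evaluates one NetworkItem on
# the GPU it was handed and returns that GPU to the shared queue when done,
# it could look roughly like this (the evaluate call mirrors the
# `eva.evaluate(...)` usage seen later in this file; its exact signature here
# is an assumption):
def run_global(eva, item, ngpu, gpu_list):
    os.environ["CUDA_VISIBLE_DEVICES"] = str(ngpu)  # pin this worker to one GPU
    try:
        return eva.evaluate(item)
    finally:
        gpu_list.put(ngpu)  # hand the GPU back even if evaluation fails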
def _confirm_train(eva, com, best_nn, best_index, ds, process_pl):
    NAS_LOG << "confirm_train"
    tmp = best_nn.item_list[best_index]
    network_item = NetworkItem(len(best_nn.item_list) + 1, tmp.graph,
                               tmp.cell_list, tmp.code)
    ds.control(stage="confirm")
    _epoch_ctrl(eva, stage="confirm")
    # retrain the winning item in a subprocess on an idle GPU
    score = process_pl.apply(
        _subp_confirm_train,
        (eva, network_item, Network.pre_block, com.idle_gpuq))
    network_item.score = score
    best_nn.item_list.append(network_item)
    return network_item
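# Hedged sketch: `_subp_confirm_train` is not shown in this section. Assuming
# it claims an idle GPU from `idle_gpuq`, evaluates the confirmed item, and
# always releases the GPU, a minimal version might be:
def _subp_confirm_train(eva, network_item, pre_block, idle_gpuq):
    gpu = idle_gpuq.get()  # block until a GPU is free
    try:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)
        # how pre_block is threaded into the evaluator is project-specific;
        # the call below mirrors eva.evaluate(..., is_bestNN=True) used later
        return eva.evaluate(network_item, is_bestNN=True)
    finally:
        idle_gpuq.put(gpu)  # release the GPU even on failure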
def wake_up_train_winner(self, res):
    print("train_winner wake up")
    score, time_cost, nn_id, spl_id = res
    print("nn_id spl_id item_list_length", nn_id, spl_id,
          len(self.net_pool[nn_id - 1].item_list))
    net = self.net_pool[nn_id - 1]
    net.item_list[spl_id - 1].score = score
    # feed the finished result back into the sampler's optimizer
    net.spl.update_opt_model(net.item_list[spl_id - 1].code,
                             -net.item_list[spl_id - 1].score)
    item_id = len(net.item_list) + 1
    # resample until an unseen table is found (bounded to 500 tries)
    cnt = 0
    while cnt < 500:
        cell, graph, table = net.spl.sample()
        if table not in self.tables:
            self.tables.append(table)
            print("sample success", cnt)
            break
        cnt += 1
    if self.tw_count > 0:
        net.item_list.append(NetworkItem(item_id, graph, cell, table))
        item = net.item_list[-1]
        task_param = [item, net.pre_block, 3, nn_id, 1, item_id, item_id,
                      True, True]
        print("train winner new task put")
        self.task.put(task_param)
        self.tw_count -= 1
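# Hedged usage sketch: `wake_up_train_winner` reads like a multiprocessing
# callback. Assuming each evaluation worker returns the
# (score, time_cost, nn_id, spl_id) tuple unpacked above, the dispatch side
# would look roughly like this (`_eval_task` and `self.pool` are hypothetical
# names, not confirmed by this section):
#
#     self.pool.apply_async(_eval_task, args=(task_param,),
#                           callback=self.wake_up_train_winner)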
def _sample_batch(network, batch_num=1, pred=None):
    """Sample `batch_num` candidates with a duplicate check on the table."""
    graphs, cells, tables = [], [], []
    use_pred = []
    spl_index = 0
    cnt = 0
    while spl_index < batch_num:
        cnt += 1
        if cnt > 500:
            NAS_LOG << ('nas_no_dim_spl', spl_index)
            raise ValueError("sample error")
        cell, graph, table = network.spl.sample()
        if pred:
            graph, cell, table = _pred_ops(network, pred, graph, table)
        if table not in tables:
            graphs.append(graph)
            cells.append(cell)
            tables.append(table)
            spl_index += 1
            if pred:
                use_pred = table  # record the table which used pred
                pred = None  # only pred one item in the init
    start_id = len(network.item_list)
    item_ids = range(start_id, start_id + batch_num)
    for item_id, graph, cell, table in zip(item_ids, graphs, cells, tables):
        network.item_list.append(
            NetworkItem(item_id, graph, cell, table, use_pred == table))
    return item_ids
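# Hedged usage sketch (assumes a populated Network instance `network`):
# because item ids start at len(network.item_list) before the append loop,
# each returned id indexes its NetworkItem directly.
#
#     item_ids = _sample_batch(network, batch_num=5)
#     for item_id in item_ids:
#         candidate = network.item_list[item_id]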
def _gpu_batch_init(nn, pred, batch_num=MAIN_CONFIG['spl_network_round']):
    """Fill a fresh network with its first `batch_num` sampled items.

    :param nn: the Network to initialize
    :param pred: predictor applied to the first sampled item
    :param batch_num: how many candidates to sample
    """
    cells, graphs, tables = _init_ops_dup_chk(nn, pred, batch_num)
    # cells, graphs, tables = _spl_dup_chk(nn, batch_num)
    for cell, graph, table, spl_id in zip(cells, graphs, tables,
                                          range(1, batch_num + 1)):
        nn.item_list.append(NetworkItem(spl_id, graph, cell, table))
def _gpu_batch_spl(nn, batch_num=MAIN_CONFIG['spl_network_round']):
    """Append `batch_num` freshly sampled items to an existing network.

    :param nn: the Network to extend
    :param batch_num: how many candidates to sample
    """
    cells, graphs, tables = _spl_dup_chk(nn, batch_num)
    item_start_id = len(nn.item_list) + 1
    for cell, graph, table, item_id in zip(
            cells, graphs, tables,
            range(item_start_id, batch_num + item_start_id)):
        nn.item_list.append(NetworkItem(item_id, graph, cell, table))
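# Hedged sketch: `_spl_dup_chk` (and the similar `_init_ops_dup_chk`) are not
# shown in this section. Judging from `_sample_batch` above, `_spl_dup_chk`
# likely wraps `nn.spl.sample()` with the same bounded duplicate check and
# returns three parallel lists; the 500-try cap is an assumption carried over
# from `_sample_batch`:
def _spl_dup_chk(nn, batch_num, max_try=500):
    cells, graphs, tables = [], [], []
    cnt = 0
    while len(tables) < batch_num:
        cnt += 1
        if cnt > max_try:
            raise ValueError("sample error: duplicate check exhausted")
        cell, graph, table = nn.spl.sample()
        if table not in tables:  # keep only structurally new candidates
            cells.append(cell)
            graphs.append(graph)
            tables.append(table)
    return cells, graphs, tables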
def _confirm_train(eva, best_nn, best_index, ds):
    time_cnt = TimeCnt()
    start_confirm = time_cnt.start()
    pre_blk = best_nn.pre_block
    blk_id = len(pre_blk)
    NAS_LOG << ("nas_confirm_train", blk_id + 1, start_confirm)
    cur_data_size = ds.control(stage="confirm")
    cur_epoch = _epoch_ctrl(eva, stage="confirm")
    Stage_Info['blk_info'][blk_id]['confirm_train_start'] = start_confirm
    Stage_Info['blk_info'][blk_id]['confirm_epoch'] = cur_epoch
    Stage_Info['blk_info'][blk_id]['confirm_data_size'] = cur_data_size
    nn_id = best_nn.id
    alig_id = 0
    graph_template = best_nn.graph_template
    item = best_nn.item_list[best_index]
    network_item = NetworkItem(len(best_nn.item_list), item.graph,
                               item.cell_list, item.code)
    task_list = [EvaScheduleItem(nn_id, alig_id, graph_template, network_item,
                                 pre_blk, ft_sign=True, bestNN=True, rd=-1,
                                 nn_left=0, spl_batch_num=1,
                                 epoch=cur_epoch, data_size=cur_data_size)]
    if MAIN_CONFIG['subp_eva_debug']:
        # debug path: run the evaluation in-process instead of scheduling it
        result = []
        for task_item in task_list:
            task_item = _subproc_eva(task_item, None, None, eva)
            result.append(task_item)
    else:
        TSche.load_tasks(task_list)
        TSche.exec_task(_subproc_eva, eva)
        result = TSche.get_result()
    network_item.score = result[0].score
    network_item.task_info = result[0]
    best_nn.item_list.append(network_item)
    end_confirm = time_cnt.stop()
    NAS_LOG << ("nas_confirm_train_fin", end_confirm)
    Stage_Info['blk_info'][blk_id]['confirm_train_cost'] = end_confirm
    return network_item
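# Note on the two paths above: with MAIN_CONFIG['subp_eva_debug'] set, the
# evaluation runs synchronously in this process, which keeps stack traces
# readable while debugging; otherwise the task goes through the scheduler
# (TSche.load_tasks -> exec_task -> get_result) and runs in a subprocess.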
graph_full = [[1, 6, 2, 3], [2, 3, 4], [3, 8, 5], [4, 5], [5], [10], [7],
              [5], [9], [5]]
cell_list = [
    Cell('conv', 64, 3, 'leakyrelu'),
    Cell('sep_conv', 32, 3, 'relu'),
    Cell('conv', 64, 3, 'leakyrelu'),
    Cell('conv', 32, 3, 'relu'),
    Cell('conv', 64, 1, 'relu6'),
    Cell('conv', 48, 3, 'relu'),
    Cell('sep_conv', 64, 3, 'relu6'),
    Cell('sep_conv', 32, 1, 'leakyrelu'),
    Cell('sep_conv', 64, 5, 'leakyrelu'),
    Cell('conv', 48, 1, 'relu')
]
network1 = NetworkItem(0, graph_full, cell_list, "")

graph_full = [[1, 6, 7, 2, 3], [2, 3], [3, 4, 5], [4, 5], [5], [9], [3],
              [8], [3]]
cell_list = [
    Cell('conv', 128, 3, 'relu6'),
    Cell('conv', 128, 5, 'leakyrelu'),
    Cell('sep_conv', 48, 1, 'leakyrelu'),
    Cell('conv', 128, 3, 'relu'),
    Cell('conv', 128, 3, 'leakyrelu'),
    Cell('conv', 64, 3, 'relu'),
    Cell('sep_conv', 48, 3, 'leakyrelu'),
    Cell('conv', 128, 3, 'relu'),
    Cell('conv', 128, 3, 'relu6')
]
network2 = NetworkItem(1, graph_full, cell_list, "")

graph_full = [[1, 6, 2, 3], [2, 7, 3], [3, 4], [4, 5], [5], [9], [5], [8],
if __name__ == "__main__":
    from multiprocessing import Pool
    from base import Network, NetworkItem, Cell

    # initialize a Network
    Net = Network(0, [[1], [2], [3], []])
    cellist = [('conv', 512, 5, 'relu'), ('pooling', 'max', 3),
               ('pooling', 'max', 2)]
    cell_list = []
    for x in cellist:
        if len(x) == 4:
            cell_list.append(Cell(x[0], x[1], x[2], x[3]))
        else:
            cell_list.append(Cell(x[0], x[1], x[2]))
    # initialize a NetworkItem
    item = NetworkItem(0, [[1], [2], [3], []], cell_list, "")
    print(type(cell_list))
    Net.item_list.append(item)
    print(Net.item_list[0])
    print("main process", Net.item_list[0].cell_list)
    # check the cell_list inside a child process
    pool = Pool(2)
    result = pool.apply_async(test_cell, args=(Net.item_list[0],))
    pool.close()
    pool.join()
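# Hedged sketch: `test_cell`, the target of apply_async above, is not defined
# in this section. A minimal version that just confirms the NetworkItem and
# its cell_list survive pickling into the child process might be:
def test_cell(item):
    print("child process", item.cell_list)
    return item.cell_list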
def _run_module(self, _graph_part, _cell_list):
    eva = Evaluator()
    eva.add_data(500)
    # wrap the raw graph and cells in a NetworkItem and evaluate it
    tmp = NetworkItem(0, _graph_part, _cell_list, "")
    return eva.evaluate(tmp, is_bestNN=True)
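# Hedged usage sketch: exercising `_run_module` with the toy four-node chain
# from the __main__ block above (`self` implies this lives on a test class):
#
#     score = self._run_module([[1], [2], [3], []], cell_list)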