def run_distributed():
    size = config.NB_PROCESSES
    processes = []
    for rank in range(size):
        p = Process(target=init_process, args=(rank, size, train_model))
        p.start()
        processes.append(p)
    # Poll until any worker exits, then tear down all remaining workers.
    while all(p.is_alive() for p in processes):
        time.sleep(5)
    for p in processes:
        p.kill()
        p.join()
    logging.info("Main process exit")
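# A minimal sketch of the init_process worker assumed above (not shown in the snippet).
# It follows the standard torch.distributed pattern: each spawned process sets the
# rendezvous address, joins the process group, then runs the passed-in training function.
# The MASTER_ADDR/MASTER_PORT values and the 'gloo' backend are illustrative assumptions.
import os
import torch.distributed as dist

def init_process(rank, size, fn, backend='gloo'):
    os.environ['MASTER_ADDR'] = '127.0.0.1'   # assumed single-node setup
    os.environ['MASTER_PORT'] = '29500'       # arbitrary free port
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size)                            # e.g. train_model(rank, size)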
def _launch_procs(self, num_procs):
    mp.set_start_method('forkserver', force=True)
    skip_msg = mp.Queue()  # Allows forked processes to share pytest.skip reason
    processes = []
    for local_rank in range(num_procs):
        p = Process(target=self._dist_init, args=(local_rank, num_procs, skip_msg))
        p.start()
        processes.append(p)

    # Now loop and wait for a test to complete. The spin-wait here isn't a big
    # deal because the number of processes will be O(#GPUs) << O(#CPUs).
    any_done = False
    while not any_done:
        for p in processes:
            if not p.is_alive():
                any_done = True
                break

    # Wait for all other processes to complete
    for p in processes:
        p.join(DEEPSPEED_UNIT_WORKER_TIMEOUT)

    failed = [(rank, p) for rank, p in enumerate(processes) if p.exitcode != 0]
    for rank, p in failed:
        # If it still hasn't terminated, kill it because it hung.
        if p.exitcode is None:
            p.terminate()
            pytest.fail(f'Worker {rank} hung.', pytrace=False)
        if p.exitcode < 0:
            pytest.fail(f'Worker {rank} killed by signal {-p.exitcode}', pytrace=False)
        if p.exitcode > 0:
            pytest.fail(f'Worker {rank} exited with code {p.exitcode}', pytrace=False)

    if not skip_msg.empty():
        # This assumes all skip messages are the same; it may be useful to
        # add a check here to assert all exit messages are equal
        pytest.skip(skip_msg.get())
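# Hedged sketch of the worker side that feeds skip_msg: if the test body running in the
# subprocess calls pytest.skip(), the resulting Skipped exception can be caught and its
# reason forwarded through the queue, so the parent loop above can re-issue the skip.
# run_test_body is a hypothetical stand-in for the actual test function.
def _worker(skip_msg):
    try:
        run_test_body()
    except BaseException as e:
        if type(e).__name__ == 'Skipped':   # raised by pytest.skip() inside the worker
            skip_msg.put(str(e))
        else:
            raise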
def dist_launcher(num_procs, *func_args, **func_kwargs):
    """Launch processes and gracefully handle failures."""
    # Spawn all workers on subprocesses.
    processes = []
    for local_rank in range(num_procs):
        p = Process(
            target=dist_init,
            args=(local_rank, num_procs, *func_args),
            kwargs=func_kwargs,
        )
        p.start()
        processes.append(p)

    # Now loop and wait for a test to complete. The spin-wait here isn't a big
    # deal because the number of processes will be O(#GPUs) << O(#CPUs).
    any_done = False
    while not any_done:
        for p in processes:
            if not p.is_alive():
                any_done = True
                break

    # Wait for all other processes to complete
    for p in processes:
        p.join(DEEPSPEED_UNIT_WORKER_TIMEOUT)

    failed = [(rank, p) for rank, p in enumerate(processes) if p.exitcode != 0]
    for rank, p in failed:
        # If it still hasn't terminated, kill it because it hung.
        if p.exitcode is None:
            p.terminate()
            pytest.fail(f"Worker {rank} hung.", pytrace=False)
        if p.exitcode < 0:
            pytest.fail(f"Worker {rank} killed by signal {-p.exitcode}",
                        pytrace=False)
        if p.exitcode > 0:
            pytest.fail(f"Worker {rank} exited with code {p.exitcode}",
                        pytrace=False)
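# A minimal sketch of the dist_init target assumed by dist_launcher (the real one is not
# shown). It sets per-process rendezvous info, initializes torch.distributed, runs the
# test body, and exits with a non-zero code on failure so the parent can report it.
# run_test, the address/port values, and the 'gloo' backend are assumptions.
import os
import sys
import torch.distributed as dist

def dist_init(local_rank, num_procs, *func_args, **func_kwargs):
    os.environ['MASTER_ADDR'] = '127.0.0.1'   # assumed single-node test setup
    os.environ['MASTER_PORT'] = '29503'       # arbitrary free port
    dist.init_process_group('gloo', rank=local_rank, world_size=num_procs)
    try:
        run_test(local_rank, num_procs, *func_args, **func_kwargs)
    except BaseException:
        sys.exit(1)   # non-zero exitcode -> parent reports pytest.fail for this rank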
class DataLoaderMultiFiles(object):
    """DataLoader to iterate over a set of DataSets"""

    def __init__(self, dataset, batch_s):
        self.dataset = dataset
        self.batch_size = batch_s
        self.index_queue = deque(torch.randperm(len(self.dataset)).tolist())
        self.batch_queue = Queue(maxsize=5)

    def __iter__(self):
        print('new iteration of dataloader')
        args = (self.batch_queue, self.index_queue, self.dataset, self.batch_size)
        self.batch_process = Process(target=fill_batch, args=args)
        self.batch_process.daemon = True
        self.batch_process.start()
        return self

    def is_alive(self):
        # return sum([e.is_alive() for e in self.buffr_processes])
        return self.batch_process.is_alive()

    def __next__(self):
        # print('batch_queue: {}'.format(self.batch_queue.qsize()))
        timeout = 600 if self.is_alive() else 1
        try:
            batch = self.batch_queue.get(timeout=timeout)
        except Empty:
            print('empty')
            self.kill()
            raise StopIteration
        # print('got batch')
        tmp = LongTensor(batch)
        # print('computing')
        return tmp

    def kill(self):
        print('Killing processes')
        self.batch_process.terminate()

    def __del__(self):
        self.kill()
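# A minimal sketch of the fill_batch worker assumed by DataLoaderMultiFiles (the real
# implementation is not shown). It drains the shuffled index deque, groups samples into
# batches, and pushes them onto the bounded batch_queue; maxsize=5 provides back-pressure
# while __next__ consumes batches in the parent process.
def fill_batch(batch_queue, index_queue, dataset, batch_size):
    batch = []
    while index_queue:
        idx = index_queue.popleft()
        batch.append(dataset[idx])
        if len(batch) == batch_size:
            batch_queue.put(batch)   # blocks when the queue is full
            batch = []
    if batch:                        # flush the final, possibly smaller batch
        batch_queue.put(batch)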
def train(pv_net, dev_train_nums=[0, ], dev_bench_num=1):
    import torch.optim as optim
    import gc
    data_rounds = 64      # 64
    data_timeout = 30     # 96
    data_timerest = 10    # 20
    loss2_weight = 0.03
    train_mcts_b = 0
    train_mcts_k = 2
    review_number = 3
    age_in_epoch = 3
    log("loss2_weight: %.2f, data_rounds: %dx%d, train_mcts_b: %d, train_mcts_k: %.1f, review_number: %d, age_in_epoch: %d"
        % (loss2_weight, len(dev_train_nums), data_rounds, train_mcts_b, train_mcts_k, review_number, age_in_epoch))
    device_main = torch.device("cuda:0")
    pv_net = pv_net.to(device_main)
    optimizer = optim.Adam(pv_net.parameters(), lr=0.0004, betas=(0.9, 0.999), eps=1e-07, weight_decay=1e-4, amsgrad=False)
    log("optimizer: %s" % (optimizer.__dict__['defaults'],))

    train_datas = []
    p_benchmark = None
    data_queue = Queue()
    for epoch in range(4000):
        if epoch % 90 == 0:
            save_name = '%s-%s-%s-%d.pkl' % (pv_net.__class__.__name__, pv_net.num_layers(), pv_net.num_paras(), epoch)
            # torch.save(pv_net, save_name)
            torch.save(pv_net.state_dict(), save_name)
            if p_benchmark is not None:
                if p_benchmark.is_alive():
                    log("waiting for benchmark process to join")
                    p_benchmark.join()
            p_benchmark = Process(target=benchmark, args=(save_name, epoch, dev_bench_num))
            p_benchmark.start()
        if (epoch <= 5) or (epoch < 30 and epoch % 5 == 0) or epoch % 30 == 0:
            output_flag = True
            log("gc len at %d: %d" % (epoch, len(gc.get_objects())))
        else:
            output_flag = False

        # start data-preparation processes
        for i in dev_train_nums:
            args = (copy.deepcopy(pv_net), i, data_rounds, train_mcts_b, train_mcts_k, data_queue)
            # p = Process(target=prepare_train_data_complete_info, args=args)
            p = Process(target=clean_worker, args=args)
            p.start()
        else:
            # for/else: no break above, so this always sleeps once all workers are launched
            time.sleep(data_timerest)

        # collect data
        if epoch >= review_number:
            train_datas = train_datas[len(train_datas) // review_number:]
        for i in range(len(dev_train_nums) * 4):
            try:
                if i == 0:
                    queue_get = data_queue.get(block=True, timeout=data_timeout * 2 + data_timerest)
                else:
                    queue_get = data_queue.get(block=True, timeout=data_timerest)
                train_datas += queue_get
            except:
                log("get data failed AGAIN at epoch %d! Has got %d datas." % (epoch, len(train_datas)), l=2)

        trainloader = torch.utils.data.DataLoader(train_datas, batch_size=128, drop_last=True, shuffle=True)
        for age in range(age_in_epoch):
            running_loss1 = []
            running_loss2 = []
            for batch in trainloader:
                p, v = pv_net(batch[0].to(device_main))
                log_p = F.log_softmax(p * batch[3].to(device_main), dim=1)
                loss1 = F.kl_div(log_p, batch[1].to(device_main), reduction="batchmean")
                loss2 = F.mse_loss(v.view(-1), batch[2].to(device_main), reduction='mean').sqrt()
                optimizer.zero_grad()
                loss = loss1 + loss2 * loss2_weight
                loss.backward()
                optimizer.step()
                running_loss1.append(loss1.item())
                running_loss2.append(loss2.item())
            batchnum = len(running_loss1)
            running_loss1 = numpy.mean(running_loss1)
            running_loss2 = numpy.mean(running_loss2)
            if output_flag and age == 0:
                if epoch == 0:
                    test_loss1 = running_loss1
                    test_loss2 = running_loss2
                elif epoch < review_number:
                    test_loss1 = running_loss1 * (epoch + 1) - last_loss1 * epoch
                    test_loss2 = running_loss2 * (epoch + 1) - last_loss2 * epoch
                else:
                    test_loss1 = running_loss1 * 3 - last_loss1 * 2
                    test_loss2 = running_loss2 * 3 - last_loss2 * 2
                log("%d: %.3f %.2f %d %d" % (epoch, test_loss1, test_loss2, len(train_datas), batchnum))
            if age == age_in_epoch - 1:
                last_loss1 = running_loss1
                last_loss2 = running_loss2
            if output_flag:
                log("  epoch %d age %d: %.3f %.2f" % (epoch, age, running_loss1, running_loss2))

    log(p_benchmark)
    log("waiting for benchmark process to join: %s" % (p_benchmark.is_alive()))
    p_benchmark.join()
    log("benchmark process should have joined: %s" % (p_benchmark.is_alive()))
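# Hypothetical skeleton of the clean_worker target launched above (the real self-play /
# MCTS data generation is project-specific and not shown). What matters for the training
# loop is the queue protocol: each worker pushes a list of samples, where every sample
# matches the batch layout consumed above: (input tensor, target policy, target value,
# legal-action mask).
def clean_worker(pv_net, dev_num, rounds, mcts_b, mcts_k, data_queue):
    device = torch.device("cuda:%d" % dev_num)
    pv_net = pv_net.to(device)
    samples = []
    for _ in range(rounds):
        # ... generate (state, policy_target, value_target, action_mask) via self-play ...
        pass
    data_queue.put(samples)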
def mmseg_parallel_predict_main(para_file, trained_model):
    print("MMSegmentation prediction using the trained model (run in parallel if using multiple GPUs)")
    machine_name = os.uname()[1]
    start_time = datetime.now()

    if os.path.isfile(para_file) is False:
        raise IOError('File %s not exists in current folder: %s' % (para_file, os.getcwd()))

    expr_name = parameters.get_string_parameters(para_file, 'expr_name')

    # network_ini = parameters.get_string_parameters(para_file, 'network_setting_ini')
    # mmseg_repo_dir = parameters.get_directory(network_ini, 'mmseg_repo_dir')
    # mmseg_code_dir = osp.join(mmseg_repo_dir, 'mmseg')
    # if os.path.isdir(mmseg_code_dir) is False:
    #     raise ValueError('%s does not exist' % mmseg_code_dir)
    # # set PYTHONPATH to use my modified version of mmseg
    # if os.getenv('PYTHONPATH'):
    #     os.environ['PYTHONPATH'] = os.getenv('PYTHONPATH') + ':' + mmseg_code_dir
    # else:
    #     os.environ['PYTHONPATH'] = mmseg_code_dir
    # print('\nPYTHONPATH is: ', os.getenv('PYTHONPATH'))

    if trained_model is None:
        trained_model = os.path.join(expr_name, 'latest.pth')

    outdir = parameters.get_directory(para_file, 'inf_output_dir')
    # remove previous results (let user remove this folder manually or in exe.sh)
    io_function.mkdir(outdir)

    # get names of inference areas
    multi_inf_regions = parameters.get_string_list_parameters(para_file, 'inference_regions')

    b_use_multiGPUs = parameters.get_bool_parameters(para_file, 'b_use_multiGPUs')

    # loop over each inference region
    sub_tasks = []
    for area_idx, area_ini in enumerate(multi_inf_regions):
        area_name = parameters.get_string_parameters(area_ini, 'area_name')
        area_remark = parameters.get_string_parameters(area_ini, 'area_remark')
        area_time = parameters.get_string_parameters(area_ini, 'area_time')

        inf_image_dir = parameters.get_directory(area_ini, 'inf_image_dir')

        # it is ok to treat a file name as a pattern and pass it to the following functions to get the file list
        inf_image_or_pattern = parameters.get_string_parameters(area_ini, 'inf_image_or_pattern')

        inf_img_list = io_function.get_file_list_by_pattern(inf_image_dir, inf_image_or_pattern)
        img_count = len(inf_img_list)
        if img_count < 1:
            raise ValueError('No image for inference, please check inf_image_dir and inf_image_or_pattern in %s' % area_ini)

        area_save_dir = os.path.join(outdir, area_name + '_' + area_remark + '_' + area_time)
        io_function.mkdir(area_save_dir)

        # parallel inference of images for this area
        CUDA_VISIBLE_DEVICES = []
        if 'CUDA_VISIBLE_DEVICES' in os.environ.keys():
            CUDA_VISIBLE_DEVICES = [int(item.strip()) for item in os.environ['CUDA_VISIBLE_DEVICES'].split(',')]
        idx = 0
        while idx < img_count:

            if b_use_multiGPUs:
                # get available GPUs: https://github.com/anderskm/gputil
                # memory: orders the available GPU device ids by ascending memory usage
                deviceIDs = GPUtil.getAvailable(order='memory', limit=100, maxLoad=0.5,
                                                maxMemory=0.5, includeNan=False, excludeID=[], excludeUUID=[])
                # only use the ones in CUDA_VISIBLE_DEVICES
                if len(CUDA_VISIBLE_DEVICES) > 0:
                    deviceIDs = [item for item in deviceIDs if item in CUDA_VISIBLE_DEVICES]
                    basic.outputlogMessage('on ' + machine_name + ', available GPUs:' + str(deviceIDs) +
                                           ', among visible ones:' + str(CUDA_VISIBLE_DEVICES))
                else:
                    basic.outputlogMessage('on ' + machine_name + ', available GPUs:' + str(deviceIDs))

                if len(deviceIDs) < 1:
                    # wait 60 seconds (mmseg needs a longer time to load models), then check the available GPUs again
                    time.sleep(60)
                    continue
                # use only the first available visible GPU
                gpuid = deviceIDs[0]
                basic.outputlogMessage('%d: predict image %s on GPU %d of %s' % (idx, inf_img_list[idx], gpuid, machine_name))
            else:
                gpuid = None
                basic.outputlogMessage('%d: predict image %s on %s' % (idx, inf_img_list[idx], machine_name))

            # run inference
            img_save_dir = os.path.join(area_save_dir, 'I%d' % idx)
            inf_list_file = os.path.join(area_save_dir, '%d.txt' % idx)

            done_indicator = '%s_done' % inf_list_file
            if os.path.isfile(done_indicator):
                basic.outputlogMessage('warning, %s exists, skip prediction' % done_indicator)
                idx += 1
                continue

            # if results already exist, then skip
            if os.path.isdir(img_save_dir) and is_file_exist_in_folder(img_save_dir):
                basic.outputlogMessage('folder of %dth image (%s) already exists, '
                                       'it has been predicted or is being predicted' % (idx, inf_img_list[idx]))
                idx += 1
                continue

            with open(inf_list_file, 'w') as inf_obj:
                inf_obj.writelines(inf_img_list[idx] + '\n')

            sub_process = Process(target=predict_one_image_mmseg,
                                  args=(para_file, inf_img_list[idx], img_save_dir, inf_list_file, gpuid, trained_model))
            sub_process.start()
            sub_tasks.append(sub_process)

            if b_use_multiGPUs is False:
                # wait until the previous one finished
                while sub_process.is_alive():
                    time.sleep(1)

            idx += 1

            # wait until predicted image patches exist or 20 minutes have elapsed
            time0 = time.time()
            elapsed_time = time.time() - time0
            while elapsed_time < 20 * 60:
                elapsed_time = time.time() - time0
                file_exist = os.path.isdir(img_save_dir) and is_file_exist_in_folder(img_save_dir)
                if file_exist is True or sub_process.is_alive() is False:
                    break
                else:
                    time.sleep(1)

            if sub_process.exitcode is not None and sub_process.exitcode != 0:
                sys.exit(1)

            basic.close_remove_completed_process(sub_tasks)
            # if 'chpc' in machine_name:
            #     time.sleep(60)  # wait 60 seconds on ITSC services
            # else:
            #     time.sleep(10)

    # check that all tasks have finished
    wait_all_finish = 0
    while basic.b_all_process_finish(sub_tasks) is False:
        if wait_all_finish % 100 == 0:
            basic.outputlogMessage('wait all tasks to finish')
        time.sleep(1)
        wait_all_finish += 1

    basic.close_remove_completed_process(sub_tasks)
    end_time = datetime.now()
    diff_time = end_time - start_time
    out_str = "%s: time cost of total parallel inference on %s: %d seconds" % (str(end_time), machine_name, diff_time.seconds)
    basic.outputlogMessage(out_str)
    with open("time_cost.txt", 'a') as t_obj:
        t_obj.writelines(out_str + '\n')
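# A minimal sketch of the is_file_exist_in_folder helper referenced above (assumed
# behavior: report whether the prediction output folder already contains any file).
def is_file_exist_in_folder(folder):
    for name in os.listdir(folder):
        if os.path.isfile(os.path.join(folder, name)):
            return True
    return False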