def exec_batch(cmd,
               numb_node,
               work_thread,
               numb_gpu,
               task_dirs,
               task_args=None,
               time_limit="24:0:0",
               mem_limit=32,
               modules=None,
               sources=None):
    """Write one Slurm script per task directory, submit them, and wait.

    For every directory in ``task_dirs`` a file named ``_sub`` is written
    with the text returned by ``make_slurm_script`` and a ``SlurmJob`` is
    created for it.  All jobs are then submitted and polled every
    10 seconds until each one reports ``JobStatus.finished``.

    Parameters
    ----------
    cmd, numb_node, work_thread, numb_gpu, time_limit, mem_limit, modules, sources
        Forwarded unchanged to ``make_slurm_script``; their meaning is
        defined by that function.
    task_dirs
        Iterable of task directories.  Relative paths are resolved against
        the working directory in effect when this function is called.
    task_args
        Optional sequence indexed in parallel with ``task_dirs``;
        ``task_args[i]`` is handed to ``make_slurm_script`` for the i-th
        directory.  When ``None``, ``None`` is handed over instead.

    Returns
    -------
    None
        Returned once no job is left unfinished (immediately when
        ``task_dirs`` is empty).

    Raises
    ------
    RuntimeError
        If any job reports ``JobStatus.terminated`` while polling.
    """
    cwd = os.getcwd()
    fin_tag = 'tag_finished'
    job_list = []

    for ii, mydir in enumerate(task_dirs):
        os.chdir(mydir)
        try:
            myarg = task_args[ii] if task_args is not None else None
            with open('_sub', 'w') as fp:
                fp.write(
                    make_slurm_script(cmd, numb_node, work_thread, numb_gpu,
                                      myarg, time_limit, mem_limit, modules,
                                      sources, fin_tag))
            job_list.append(
                SlurmJob(os.getcwd(), '_sub', job_finish_tag=fin_tag))
        finally:
            # Always hand the caller's working directory back, even when
            # writing the script or building the job fails.
            os.chdir(cwd)

    for job in job_list:
        job.submit()

    while True:
        find_unfinish = False
        # Every job is queried on every pass (no short-circuit), matching
        # the polling pattern callers have relied on so far.
        for job in job_list:
            stat = job.check_status()
            if stat == JobStatus.terminated:
                raise RuntimeError("find terminated job")
            if stat != JobStatus.finished:
                find_unfinish = True
        if not find_unfinish:
            return
        time.sleep(10)
def exec_batch_group(cmd,
                     work_thread,
                     numb_gpu,
                     task_dirs_,
                     group_size=10,
                     task_args=None,
                     time_limit="24:0:0",
                     mem_limit=6,
                     modules=None,
                     sources=None):
    """Bundle tasks into groups, submit one Slurm job per group, and wait.

    The task directories are split into consecutive chunks of at most
    ``group_size`` entries.  For chunk number ``k`` a directory
    ``group.<k as 6 digits>`` is created (if missing) inside the parent
    directory of the first task directory, a file named ``sub`` is written
    there with the text returned by ``make_slurm_script_group``, and a
    ``SlurmJob`` is created for it.  All jobs are then submitted and polled
    every 10 seconds until each one reports ``JobStatus.finished``.

    Parameters
    ----------
    cmd, work_thread, numb_gpu, time_limit, mem_limit, modules, sources
        Forwarded unchanged to ``make_slurm_script_group``; their meaning
        is defined by that function.
    task_dirs_
        Sequence of task directories.  They are converted to absolute
        paths before being handed to ``make_slurm_script_group``.
    group_size
        Maximum number of tasks placed in one group.
    task_args
        Optional sequence with one entry per task directory.  When
        ``None``, an empty string is used for every task.

    Returns
    -------
    None
        Returned once no job is left unfinished.  An empty ``task_dirs_``
        returns immediately without touching the file system.

    Raises
    ------
    AssertionError
        If ``task_args`` is given and its length differs from the number
        of task directories.
    RuntimeError
        If any job reports ``JobStatus.terminated`` while polling.
    """
    cwd = os.getcwd()
    fin_tag = 'tag_finished'

    task_dirs = [os.path.abspath(path) for path in task_dirs_]
    if task_args is not None:
        assert (len(task_dirs) == len(task_args))
    else:
        task_args = ["" for _ in task_dirs]

    # Nothing to run: return quietly instead of failing on task_dirs_[0].
    if not task_dirs:
        return

    # The group directories live next to the first task directory.  The
    # parent is resolved by actually changing directory, as before.
    os.chdir(task_dirs_[0])
    try:
        os.chdir('..')
        working_dir = os.getcwd()
    finally:
        os.chdir(cwd)

    ntasks = len(task_dirs)
    starts = range(0, ntasks, group_size)
    # Both chunk lists are driven by the same start offsets, so they
    # always contain the same number of entries.
    task_chunks = [task_dirs[i:i + group_size] for i in starts]
    args_chunks = [task_args[i:i + group_size] for i in starts]

    job_list = []
    try:
        for ii, (chunk_dirs, chunk_args) in enumerate(
                zip(task_chunks, args_chunks)):
            os.chdir(working_dir)
            group_dir = "group.%06d" % ii
            if not os.path.isdir(group_dir):
                os.mkdir(group_dir)
            os.chdir(group_dir)
            with open('sub', 'w') as fp:
                fp.write(
                    make_slurm_script_group(cmd, chunk_dirs, work_thread,
                                            numb_gpu, chunk_args, time_limit,
                                            mem_limit, modules, sources,
                                            fin_tag))
            job_list.append(
                SlurmJob(os.getcwd(), 'sub', job_finish_tag=fin_tag))
    finally:
        # Always hand the caller's working directory back, even when
        # writing a script or building a job fails.
        os.chdir(cwd)

    for job in job_list:
        job.submit()

    while True:
        find_unfinish = False
        # Every job is queried on every pass (no short-circuit), matching
        # the polling pattern callers have relied on so far.
        for job in job_list:
            stat = job.check_status()
            if stat == JobStatus.terminated:
                raise RuntimeError("find terminated job")
            if stat != JobStatus.finished:
                find_unfinish = True
        if not find_unfinish:
            return
        time.sleep(10)