def main():
    """Entry point: resolve the task list from the CLI arguments and run it.

    Expects ``sys.argv[1:]`` to hold four values -- master name, target
    name, dir name and GPU info -- plus an optional fifth value that is
    handed to ``wait_last_task`` before any task is started.

    Raises:
        AssertionError: if the argument count is not 4 or 5, or if
            ``get_dir_and_cfgs`` yields nothing for the given dir name.
    """
    cli_args = sys.argv[1:]
    assert len(cli_args) in (4, 5), cli_args
    master_name, target_name, dir_name, gpu_info = cli_args[:4]
    previous_task = cli_args[4] if len(cli_args) == 5 else None

    info_proc = InfoProc(master_name)

    # Look up everything we need first ...
    dir_and_cfgs = info_proc.get_dir_and_cfgs(dir_name)
    target_dir = info_proc.get_target_dir(target_name)
    base_dir = info_proc.get_base_dir()

    # ... then configure the environment for the chosen target.
    info_proc.set_target_env(target_name)
    if is_use_slurm():
        info_proc.set_slurm_env()

    assert dir_and_cfgs, dir_and_cfgs

    # Show the plan before anything is launched.
    print(f"The target name is: {target_name}, we will cd to {target_dir}")
    print("Tasks:")
    for work_dir, config in dir_and_cfgs:
        task_line = f" {colored(config, 'blue')} => {colored(work_dir, 'green')} using {gpu_info} gpus."
        print(task_line)

    if previous_task is not None:
        wait_last_task(previous_task)

    start_tasks(dir_and_cfgs, base_dir, target_dir, gpu_info)
def add_user_gpus(self, node_users, users_gpus, all_gpu_num=None):
    """Annotate user names with the GPU ids they occupy.

    Args:
        node_users: user names (possibly shortened prefixes) to annotate.
        users_gpus: mapping of full user name -> list of GPU id strings;
            may be empty or ``None``, in which case nothing is annotated.
        all_gpu_num: if an int equal to the number of distinct GPU ids a
            user holds, the suffix is ``(all)`` instead of the id list.

    Returns:
        A new list: every entry of ``node_users`` (suffixed with the GPU
        list of the first ``users_gpus`` key it is a prefix of, if any),
        followed by a colored entry for each ``users_gpus`` key that no
        entry of ``node_users`` is a prefix of.
    """
    def _tag(name, gpus):
        # "(all)" when the user holds every GPU, else the explicit id list.
        holds_all = (isinstance(all_gpu_num, int)
                     and len(set(gpus)) == all_gpu_num)
        return name + ("(all)" if holds_all else f"({','.join(gpus)})")

    if not users_gpus:
        return list(node_users)

    annotated = []
    for short_name in node_users:
        # First full name that this (possibly shortened) name is a prefix of.
        hit = next(
            ((full_name, gpus) for full_name, gpus in users_gpus.items()
             if full_name.startswith(short_name)),
            None,
        )
        annotated.append(
            short_name if hit is None else _tag(short_name, hit[1]))

    # GPU holders that are not covered by any listed user get their own,
    # colored, entry at the end.
    for full_name, gpus in users_gpus.items():
        if any(full_name.startswith(short_name) for short_name in node_users):
            continue
        tagged = _tag(full_name, gpus)
        highlight = 'red' if is_use_slurm() else 'white'
        annotated.append(colored(tagged, highlight))
    return annotated
def build_data(self, all_nodes):
    """Collect per-node info and bucket node names by suggested level.

    Args:
        all_nodes: node names; each one is passed to ``self.single_node``.

    Returns:
        ``(prog_data, summary)``. ``prog_data`` is the list of per-node
        infos after being passed through ``self.add_slurm_info`` (when
        ``is_use_slurm()`` is true) or ``self.add_user_info`` (otherwise).
        ``summary`` maps ``'node_l3'`` .. ``'node_l0'`` to the node names
        whose suggested level matched.
    """
    summary = {f'node_l{level}': [] for level in (3, 2, 1, 0)}
    rows = []
    for node in all_nodes:
        row, level = self.single_node(node)
        rows.append(row)
        summary[f'node_l{level}'].append(node)

    # Pick the enrichment step that matches the scheduler in use.
    enrich = self.add_slurm_info if is_use_slurm() else self.add_user_info
    return enrich(all_nodes, rows), summary
def start_tasks(dir_and_cfgs, base_dir, target_dir, gpu_info):
    """Launch every (dir, cfg) task, one after another.

    Under slurm each task is handed straight to ``do_job`` with
    ``shell='slurm'``. Otherwise the function polls ``get_free_gpu`` until
    every GPU id named in ``gpu_info`` is reported free, then runs the task
    exactly once with ``shell='dist'``.

    Args:
        dir_and_cfgs: iterable of ``(dirn, cfg)`` pairs.
        base_dir: forwarded unchanged to ``do_job``.
        target_dir: forwarded unchanged to ``do_job``.
        gpu_info: forwarded unchanged in slurm mode; in non-slurm mode a
            comma-separated string of GPU ids, split into a list before
            being passed to ``do_job``.
    """
    for dirn, cfg in dir_and_cfgs:
        if is_use_slurm():
            do_job(base_dir, target_dir, cfg, dirn, gpu_info, shell='slurm')
        else:
            # The requested ids do not change while polling, so parse once.
            gpu_list = gpu_info.split(',')
            wanted = set(gpu_list)
            while True:
                gpu_free_id, _ = get_free_gpu(thre=0.9)
                missing = wanted.difference(gpu_free_id)
                if not missing:
                    do_job(base_dir, target_dir, cfg, dirn, gpu_list,
                           shell='dist')
                    # Stop polling once the task has been handed off;
                    # without this the loop never exits, the same task is
                    # launched again and later tasks never start.
                    break
                print(f"Waiting for gpus: {sorted(missing)}")
                time.sleep(10)
        # Short pause before moving on to the next task.
        # NOTE(review): confirm this pause is meant to apply after every
        # task in both slurm and non-slurm mode.
        time.sleep(5)
def init_headers(self):
    """Return the list of column headers.

    The list always ends with ``GPU0`` .. ``GPU7`` followed by ``users``.
    When ``is_use_slurm()`` is true it starts with ``parti``, ``node``,
    ``gpu``, ``cpu``, ``mem``; otherwise it starts with ``node`` only.
    """
    gpu_columns = [f'GPU{idx}' for idx in range(8)]
    if is_use_slurm():
        leading = ['parti', 'node', 'gpu', 'cpu', 'mem']
    else:
        leading = ['node']
    return [*leading, *gpu_columns, 'users']