コード例 #1
0
def main():
    """Command-line entry point: resolve the task list and launch it.

    Reads four positional arguments from ``sys.argv`` (master name, target
    name, directory name and GPU spec).  An optional fifth argument is
    handed to ``wait_last_task`` before any task is started.
    """
    argv = sys.argv[1:]
    argc = len(argv)
    assert argc in [4, 5], argv

    master, target, dir_key, gpus = argv[:4]
    proc = InfoProc(master)

    # Look everything up first ...
    tasks = proc.get_dir_and_cfgs(dir_key)
    workdir = proc.get_target_dir(target)
    root = proc.get_base_dir()

    # ... then apply the environment settings.
    proc.set_target_env(target)
    if is_use_slurm():
        proc.set_slurm_env()
    assert tasks, tasks

    # Show the user what is about to run.
    print(f"The target name is: {target}, we will cd to {workdir}")
    print("Tasks:")
    for out_dir, cfg in tasks:
        cfg_text = colored(cfg, 'blue')
        dir_text = colored(out_dir, 'green')
        print(f"  {cfg_text} => {dir_text} using {gpus} gpus.")

    if argc == 5:
        wait_last_task(argv[4])

    start_tasks(tasks, root, workdir, gpus)
コード例 #2
0
    def add_user_gpus(self, node_users, users_gpus, all_gpu_num=None):
        """Decorate user names with the GPU ids they occupy.

        Each entry of ``node_users`` is matched against the keys of
        ``users_gpus`` by prefix (``full_user.startswith(user)``); the first
        matching key supplies the GPU list appended as ``"(id,id,...)"``, or
        ``"(all)"`` when ``all_gpu_num`` is an int equal to the number of
        distinct ids.  Keys of ``users_gpus`` that match no node user are
        appended afterwards, passed through ``colored`` (red under slurm,
        white otherwise).

        Returns:
            A new list of display strings; ``node_users`` is not modified.
        """
        def _label(name, gpus):
            # "(all)" only applies when the caller told us the total count.
            uses_all = (isinstance(all_gpu_num, int)
                        and len(set(gpus)) == all_gpu_num)
            suffix = "(all)" if uses_all else f"({','.join(gpus)})"
            return name + suffix

        gpu_map = users_gpus if users_gpus else {}

        # Pass 1: known node users, labelled when a prefix match exists.
        labelled = []
        for short in node_users:
            hit = next(
                (gpus for full, gpus in gpu_map.items()
                 if full.startswith(short)),
                None,
            )
            if hit is not None:
                short = _label(short, hit)
            labelled.append(short)

        # Pass 2: GPU holders that did not match any node user.
        for full, gpus in gpu_map.items():
            if any(full.startswith(short) for short in node_users):
                continue
            text = _label(full, gpus)
            tint = 'red' if is_use_slurm() else 'white'
            labelled.append(colored(text, tint))

        return labelled
コード例 #3
0
    def build_data(self, all_nodes):
        """Collect per-node rows and bucket node names by suggested level.

        For every name in ``all_nodes`` this calls ``self.single_node`` and
        keeps its first result as a row; the second result selects which
        ``node_l<level>`` list in the summary receives the node name.  The
        rows are then passed through ``self.add_slurm_info`` under slurm, or
        ``self.add_user_info`` otherwise.

        Returns:
            ``(rows, summary)`` where ``summary`` maps ``node_l3`` ..
            ``node_l0`` to lists of node names.
        """
        summary = {'node_l3': [], 'node_l2': [], 'node_l1': [], 'node_l0': []}
        rows = []

        for node in all_nodes:
            info, level = self.single_node(node)
            rows.append(info)
            summary[f'node_l{level}'].append(node)

        # Pick the enrichment step that matches the scheduler in use.
        enrich = self.add_slurm_info if is_use_slurm() else self.add_user_info
        return enrich(all_nodes, rows), summary
コード例 #4
0
def start_tasks(dir_and_cfgs, base_dir, target_dir, gpu_info):
    """Launch one job per ``(dirn, cfg)`` pair, in order.

    Under slurm each job is handed to ``do_job`` with ``shell='slurm'``
    straight away.  Otherwise the function polls ``get_free_gpu`` every 10
    seconds until every GPU named in ``gpu_info`` is reported free, launches
    the job once with ``shell='dist'`` and moves on to the next pair.  A
    5-second pause follows every pair.

    Args:
        dir_and_cfgs: iterable of ``(dirn, cfg)`` pairs.
        base_dir: forwarded unchanged to ``do_job``.
        target_dir: forwarded unchanged to ``do_job``.
        gpu_info: comma-separated GPU ids (e.g. ``"0,1"``).  Passed as-is to
            slurm jobs and as a list of id strings to dist jobs.
    """
    for dirn, cfg in dir_and_cfgs:
        if is_use_slurm():
            do_job(base_dir, target_dir, cfg, dirn, gpu_info, shell='slurm')
        else:
            # The requested ids do not change while we wait; parse them once.
            gpu_list = gpu_info.split(',')
            wanted = set(gpu_list)
            while True:
                # NOTE(review): the comparison below assumes get_free_gpu
                # reports ids as strings, like the split gpu_info - confirm.
                gpu_free_id, _ = get_free_gpu(thre=0.9)
                if wanted.issubset(gpu_free_id):
                    do_job(base_dir, target_dir, cfg, dirn, gpu_list,
                           shell='dist')
                    # Leave the polling loop once the job has been launched.
                    # Without this break the same job was relaunched forever
                    # and later pairs were never reached.
                    break
                wait_for = sorted(wanted.difference(gpu_free_id))
                print(f"Waiting for gpus: {wait_for}")
                time.sleep(10)
        time.sleep(5)
コード例 #5
0
 def init_headers(self):
     """Return the table column titles for the current scheduler mode.

     Under slurm the row starts with partition, node, gpu, cpu and mem
     columns; otherwise it starts with the node column only.  Both layouts
     continue with ``GPU0`` .. ``GPU7`` and end with ``users``.
     """
     leading = ['node']
     if is_use_slurm():
         leading = ['parti', 'node', 'gpu', 'cpu', 'mem']
     gpu_columns = [f'GPU{idx}' for idx in range(8)]
     return leading + gpu_columns + ['users']