Ejemplo n.º 1
0
 def check_server(self, profile):
     """Report whether an ``SSHSession`` can be opened and closed for ``profile``.

     Args:
         profile: Value handed unchanged to ``SSHSession``.

     Returns:
         bool: ``True`` when both constructing the session and closing it
         succeed, ``False`` when either step raises an ``Exception``.
     """
     try:
         session = SSHSession(profile)
         session.close()
     except Exception:
         # Any ordinary failure means "not usable". KeyboardInterrupt and
         # SystemExit are deliberately left to propagate so that the probe
         # cannot swallow a user abort or an interpreter shutdown.
         return False
     return True
Ejemplo n.º 2
0
 def __init__(self,
              remote_profile,
              context_type='local',
              batch_type='slurm'):
     """Store the profile and pick the session, context class and batch class.

     Args:
         remote_profile: Kept on ``self.remote_profile`` and passed to the
             session constructor for the 'local' and 'ssh' context types.
         context_type: One of 'local', 'lazy-local' or 'ssh'.
         batch_type: One of 'slurm', 'lsf', 'pbs', 'shell' or 'aws'.

     Raises:
         RuntimeError: If ``context_type`` or ``batch_type`` is not one of
             the supported values.
     """
     self.remote_profile = remote_profile

     # Reject an unsupported context before any session is created.
     if context_type not in ('local', 'lazy-local', 'ssh'):
         raise RuntimeError('unknown context')

     if context_type == 'local':
         self.session = LocalSession(remote_profile)
         self.context = LocalContext
         self.uuid_names = False
     elif context_type == 'lazy-local':
         # The lazy-local variant is the only one without a session object
         # and the only one that sets uuid_names.
         self.session = None
         self.context = LazyLocalContext
         self.uuid_names = True
     else:
         self.session = SSHSession(remote_profile)
         self.context = SSHContext
         self.uuid_names = False

     # The batch classes are wrapped in lambdas so that only the class that
     # is actually requested gets looked up.
     batch_lookups = (
         ('slurm', lambda: Slurm),
         ('lsf', lambda: LSF),
         ('pbs', lambda: PBS),
         ('shell', lambda: Shell),
         ('aws', lambda: AWS),
     )
     for batch_name, lookup in batch_lookups:
         if batch_type == batch_name:
             self.batch = lookup()
             break
     else:
         raise RuntimeError('unknown batch ' + batch_type)
Ejemplo n.º 3
0
def decide_model_devi_machine(mdata):
    """Choose the machine for the ``model_devi`` stage and record it in ``mdata``.

    When ``mdata`` has a ``'model_devi'`` list, the keys
    ``model_devi_machine``, ``model_devi_resources``, ``lmp_command`` and
    ``model_devi_group_size`` are filled in from, in order of precedence:

    1. the first list entry, when its machine has no ``"hostname"`` key;
    2. a ``record.machine`` file in the current directory whose ``purpose``
       is ``'model_devi'``;
    3. the first candidate whose queue query prints no lines, or otherwise
       the candidate with the fewest printed lines. This choice is written
       to ``record.machine``.

    Args:
        mdata (dict): Machine configuration; modified in place.

    Returns:
        dict: The same ``mdata`` object.
    """
    if 'model_devi' not in mdata:
        return mdata

    candidates = mdata['model_devi']
    continue_flag = False

    # Reuse a previously recorded decision when one exists for this stage.
    if 'record.machine' in os.listdir():
        try:
            with open('record.machine', 'r') as _infile:
                profile = json.load(_infile)
            if profile['purpose'] == 'model_devi':
                mdata['model_devi_machine'] = profile['machine']
                mdata['model_devi_resources'] = profile['resources']
                mdata['lmp_command'] = profile['command']
                mdata['model_devi_group_size'] = profile['group_size']
                continue_flag = True
        except (OSError, ValueError, KeyError, TypeError):
            # Best effort: an unreadable, malformed or incomplete record is
            # ignored and a fresh decision is made below.
            pass

    # A machine without a hostname needs no remote probing; use it directly.
    first = candidates[0]
    if "hostname" not in first["machine"]:
        mdata["model_devi_machine"] = first["machine"]
        mdata["model_devi_resources"] = first["resources"]
        mdata["lmp_command"] = first["command"]
        mdata["model_devi_group_size"] = first["group_size"]
        continue_flag = True

    if continue_flag:
        return mdata

    chosen = None
    pd_count_list = []
    for candidate in candidates:
        temp_machine = candidate['machine']
        temp_resources = candidate['resources']
        temp_ssh_sess = SSHSession(temp_machine)
        cwd = os.getcwd()
        temp_context = SSHContext(cwd, temp_ssh_sess)
        temp_batch = Slurm(temp_context)
        command = temp_batch._make_squeue(temp_machine, temp_resources)
        _, _, stdout, _ = temp_batch.context.block_call(command)
        pd_response = stdout.read().decode('utf-8').split("\n")
        # str.split always returns at least one element, so len() can never
        # be 0; count the non-empty lines instead.
        # NOTE(review): assumes the query prints one line per pending job --
        # confirm against _make_squeue.
        pd_count = sum(1 for line in pd_response if line.strip())
        temp_context.clean()
        if pd_count == 0:
            chosen = candidate
            break
        pd_count_list.append(pd_count)

    if chosen is None:
        # Every candidate printed something: take the smallest count.
        chosen = candidates[int(np.argmin(pd_count_list))]

    mdata['model_devi_machine'] = chosen['machine']
    mdata['model_devi_resources'] = chosen['resources']
    mdata['lmp_command'] = chosen['command']
    mdata['model_devi_group_size'] = chosen['group_size']

    # Persist the decision so that later calls can reuse it.
    profile = {
        'purpose': 'model_devi',
        'machine': mdata['model_devi_machine'],
        'resources': mdata['model_devi_resources'],
        'group_size': mdata['model_devi_group_size'],
        'command': mdata['lmp_command'],
    }
    with open("record.machine", "w") as _outfile:
        json.dump(profile, _outfile, indent=4)
    return mdata
Ejemplo n.º 4
0
def decide_fp_machine(mdata):
    """Choose the machine for the ``fp`` stage and record it in ``mdata``.

    When ``mdata`` has an ``'fp'`` list, the keys ``fp_machine``,
    ``fp_resources``, ``fp_command`` and ``fp_group_size`` are filled in
    from, in order of precedence:

    1. the first list entry, when its machine has no ``"hostname"`` key;
    2. a ``record.machine`` file in the current directory whose ``purpose``
       is ``'fp'``;
    3. the first candidate whose queue query prints no lines, or otherwise
       the candidate with the fewest printed lines. This choice is written
       to ``record.machine``.

    Args:
        mdata (dict): Machine configuration; modified in place.

    Returns:
        dict: The same ``mdata`` object.
    """
    if 'fp' not in mdata:
        return mdata

    candidates = mdata['fp']
    continue_flag = False

    # Decide whether to reuse a previously recorded machine.
    if 'record.machine' in os.listdir():
        try:
            with open('record.machine', 'r') as _infile:
                profile = json.load(_infile)
            if profile['purpose'] == 'fp':
                mdata['fp_machine'] = profile['machine']
                mdata['fp_resources'] = profile['resources']
                mdata['fp_command'] = profile['command']
                mdata['fp_group_size'] = profile['group_size']
                continue_flag = True
        except (OSError, ValueError, KeyError, TypeError):
            # Best effort: an unreadable, malformed or incomplete record is
            # ignored and a fresh decision is made below.
            pass

    # A machine without a hostname needs no remote probing; use it directly.
    first = candidates[0]
    if "hostname" not in first["machine"]:
        mdata["fp_machine"] = first["machine"]
        mdata["fp_resources"] = first["resources"]
        mdata["fp_command"] = first["command"]
        mdata["fp_group_size"] = first["group_size"]
        continue_flag = True

    if continue_flag:
        return mdata

    chosen = None
    pd_count_list = []
    for candidate in candidates:
        temp_machine = candidate['machine']
        temp_resources = candidate['resources']
        temp_ssh_sess = SSHSession(temp_machine)
        cwd = os.getcwd()
        temp_context = SSHContext(cwd, temp_ssh_sess)
        temp_batch = Slurm(temp_context)
        command = temp_batch._make_squeue(temp_machine, temp_resources)
        _, _, stdout, _ = temp_batch.context.block_call(command)
        pd_response = stdout.read().decode('utf-8').split("\n")
        # str.split always returns at least one element, so len() can never
        # be 0; count the non-empty lines instead.
        # NOTE(review): assumes the query prints one line per pending job --
        # confirm against _make_squeue.
        pd_count = sum(1 for line in pd_response if line.strip())
        temp_context.clean()
        if pd_count == 0:
            chosen = candidate
            break
        pd_count_list.append(pd_count)

    if chosen is None:
        # Every candidate printed something: take the smallest count.
        chosen = candidates[int(np.argmin(pd_count_list))]

    mdata['fp_machine'] = chosen['machine']
    mdata['fp_resources'] = chosen['resources']
    mdata['fp_command'] = chosen['command']
    mdata['fp_group_size'] = chosen['group_size']

    # Persist the decision so that later calls can reuse it.
    profile = {
        'purpose': 'fp',
        'machine': mdata['fp_machine'],
        'resources': mdata['fp_resources'],
        'group_size': mdata['fp_group_size'],
        'command': mdata['fp_command'],
    }
    with open("record.machine", "w") as _outfile:
        json.dump(profile, _outfile, indent=4)
    return mdata
Ejemplo n.º 5
0
def decide_train_machine(mdata):
    """Choose the machine for the ``train`` stage and record it in ``mdata``.

    When ``mdata`` has a ``'train'`` list, the keys ``train_machine``,
    ``train_resources`` and ``deepmd_path`` (plus ``train_group_size`` when
    a ``group_size`` is available) are filled in from, in order of
    precedence:

    1. the first list entry, when its machine has no ``"hostname"`` key;
    2. a ``record.machine`` file in the current directory whose ``purpose``
       is ``'train'``;
    3. the first candidate whose queue query prints no lines, or otherwise
       the candidate with the fewest printed lines. This choice is written
       to ``record.machine``.

    Args:
        mdata (dict): Machine configuration; modified in place.

    Returns:
        dict: The same ``mdata`` object.
    """
    if 'train' not in mdata:
        return mdata

    candidates = mdata['train']
    continue_flag = False

    # Decide whether to reuse a previously recorded machine.
    if 'record.machine' in os.listdir():
        try:
            with open('record.machine', 'r') as _infile:
                profile = json.load(_infile)
            if profile['purpose'] == 'train':
                mdata['train_machine'] = profile['machine']
                mdata['train_resources'] = profile['resources']
                mdata['deepmd_path'] = profile['deepmd_path']
                if "group_size" in profile:
                    mdata["train_group_size"] = profile["group_size"]
                continue_flag = True
        except (OSError, ValueError, KeyError, TypeError):
            # Best effort: an unreadable, malformed or incomplete record is
            # ignored and a fresh decision is made below.
            pass

    # A machine without a hostname needs no remote probing; use it directly.
    first = candidates[0]
    if "hostname" not in first["machine"]:
        mdata["train_machine"] = first["machine"]
        mdata["train_resources"] = first["resources"]
        mdata["deepmd_path"] = first["deepmd_path"]
        if "group_size" in first:
            mdata["train_group_size"] = first["group_size"]
        continue_flag = True

    if continue_flag:
        return mdata

    # "pd" stands for pending jobs in slurm.
    chosen = None
    pd_count_list = []
    for candidate in candidates:
        temp_machine = candidate['machine']
        temp_resources = candidate['resources']
        temp_ssh_sess = SSHSession(temp_machine)
        cwd = os.getcwd()
        temp_context = SSHContext(cwd, temp_ssh_sess)
        temp_batch = Slurm(temp_context)
        command = temp_batch._make_squeue(temp_machine, temp_resources)
        _, _, stdout, _ = temp_batch.context.block_call(command)
        pd_response = stdout.read().decode('utf-8').split("\n")
        # Count non-empty lines so that empty output means "nothing pending"
        # regardless of whether the text ends with a newline.
        # NOTE(review): assumes the query prints one line per pending job --
        # confirm against _make_squeue.
        pd_count = sum(1 for line in pd_response if line.strip())
        temp_context.clean()
        if pd_count == 0:
            # No need to wait for an allocation on this machine.
            chosen = candidate
            break
        pd_count_list.append(pd_count)

    if chosen is None:
        # All machines need waiting: select the one with the fewest
        # waiting jobs.
        chosen = candidates[int(np.argmin(pd_count_list))]

    mdata['train_machine'] = chosen['machine']
    mdata['train_resources'] = chosen['resources']
    mdata['deepmd_path'] = chosen['deepmd_path']
    # Applied to both selection paths; the immediate-pick path previously
    # skipped the group size while the fallback path set it.
    if "group_size" in chosen:
        mdata["train_group_size"] = chosen["group_size"]

    # Record which machine is selected.
    profile = {
        'purpose': 'train',
        'machine': mdata['train_machine'],
        'resources': mdata['train_resources'],
        'deepmd_path': mdata['deepmd_path'],
    }
    if "train_group_size" in mdata:
        profile["group_size"] = mdata["train_group_size"]
    with open("record.machine", "w") as _outfile:
        json.dump(profile, _outfile, indent=4)
    return mdata
Ejemplo n.º 6
0
def decide_fp_machine(mdata):
    """Choose the machine for the ``fp`` stage and record it in ``mdata``.

    Nothing is done when ``mdata`` has no ``'fp'`` list. Otherwise, for an
    ``api_version`` of at least 1.0, ``fp_group_size`` is first seeded from
    the first entry's ``resources['group_size']``. The keys ``fp_machine``,
    ``fp_resources``, ``fp_command`` and ``fp_group_size`` are then filled
    in from, in order of precedence:

    1. the first list entry, when its machine has no ``"hostname"`` key or
       it is the only entry;
    2. a ``record.machine`` file in the current directory whose ``purpose``
       is ``'fp'``;
    3. the first candidate whose queue query prints no lines, or otherwise
       the candidate with the fewest printed lines. This choice is written
       to ``record.machine``.

    Args:
        mdata (dict): Machine configuration; modified in place.

    Returns:
        dict: The same ``mdata`` object.
    """
    if 'fp' not in mdata:
        return mdata

    candidates = mdata['fp']
    # Evaluated only once 'fp' is known to exist; reading mdata['fp'] ahead
    # of that check raised KeyError for api_version >= 1.0 without an fp
    # section.
    if LooseVersion(mdata.get('api_version', '0.9')) >= LooseVersion('1.0'):
        mdata['fp_group_size'] = candidates[0]['resources']['group_size']

    continue_flag = False

    # Decide whether to reuse a previously recorded machine.
    if 'record.machine' in os.listdir():
        try:
            with open('record.machine', 'r') as _infile:
                profile = json.load(_infile)
            if profile['purpose'] == 'fp':
                mdata['fp_machine'] = profile['machine']
                mdata['fp_resources'] = profile['resources']
                mdata['fp_command'] = profile['command']
                mdata['fp_group_size'] = profile['group_size']
                continue_flag = True
        except (OSError, ValueError, KeyError, TypeError):
            # Best effort: an unreadable, malformed or incomplete record is
            # ignored and a fresh decision is made below.
            pass

    # No probing is needed for a machine without a hostname or when there
    # is only one candidate to choose from.
    first = candidates[0]
    if ("hostname" not in first["machine"]) or (len(candidates) == 1):
        mdata["fp_machine"] = first["machine"]
        mdata["fp_resources"] = first["resources"]
        mdata["fp_command"] = first["command"]
        mdata["fp_group_size"] = first.get("group_size", 1)
        continue_flag = True

    if continue_flag:
        return mdata

    chosen = None
    pd_count_list = []
    for candidate in candidates:
        temp_machine = candidate['machine']
        temp_resources = candidate['resources']
        temp_ssh_sess = SSHSession(temp_machine)
        cwd = os.getcwd()
        temp_context = SSHContext(cwd, temp_ssh_sess)
        # For other types of machines, add them using 'elif'; Slurm is the
        # default choice for convenience.
        if temp_machine['machine_type'] == 'lsf':
            temp_batch = LSF(temp_context)
        else:
            temp_batch = Slurm(temp_context)
        command = temp_batch._make_squeue(temp_machine, temp_resources)
        _, _, stdout, _ = temp_batch.context.block_call(command)
        pd_response = stdout.read().decode('utf-8').split("\n")
        # str.split always returns at least one element, so len() can never
        # be 0; count the non-empty lines instead.
        # NOTE(review): assumes the query prints one line per pending job --
        # confirm against _make_squeue.
        pd_count = sum(1 for line in pd_response if line.strip())
        temp_context.clean()
        if pd_count == 0:
            chosen = candidate
            break
        pd_count_list.append(pd_count)

    if chosen is None:
        # Every candidate printed something: take the smallest count.
        chosen = candidates[int(np.argmin(pd_count_list))]

    mdata['fp_machine'] = chosen['machine']
    mdata['fp_resources'] = chosen['resources']
    mdata['fp_command'] = chosen['command']
    mdata['fp_group_size'] = chosen.get('group_size', 1)

    # Persist the decision so that later calls can reuse it.
    profile = {
        'purpose': 'fp',
        'machine': mdata['fp_machine'],
        'resources': mdata['fp_resources'],
        'group_size': mdata['fp_group_size'],
        'command': mdata['fp_command'],
    }
    with open("record.machine", "w") as _outfile:
        json.dump(profile, _outfile, indent=4)
    return mdata