def gen_init_surf(args):
    try:
        import ruamel
        from monty.serialization import loadfn, dumpfn
        warnings.simplefilter('ignore', ruamel.yaml.error.MantissaNoDotYAML1_1Warning)
        jdata = loadfn(args.PARAM)
        if args.MACHINE is not None:
            mdata = loadfn(args.MACHINE)
    except Exception:
        # fall back to plain json if ruamel/monty are unavailable
        with open(args.PARAM, 'r') as fp:
            jdata = json.load(fp)
        if args.MACHINE is not None:
            with open(args.MACHINE, 'r') as fp:
                mdata = json.load(fp)

    out_dir = out_dir_name(jdata)
    jdata['out_dir'] = out_dir
    dlog.info("# working dir %s" % out_dir)

    if args.MACHINE is not None:
        # decide a proper machine
        mdata = decide_fp_machine(mdata)
        fp_machine = mdata['fp_machine']
        fp_ssh_sess = SSHSession(fp_machine)

    stage_list = [int(i) for i in jdata['stages']]
    for stage in stage_list:
        if stage == 1:
            create_path(out_dir)
            make_super_cell_pymatgen(jdata)
            place_element(jdata)
            make_vasp_relax(jdata)
            if args.MACHINE is not None:
                run_vasp_relax(jdata, mdata, fp_ssh_sess)
        # elif stage == 0:
        #     create_path(out_dir)
        #     make_super_cell(jdata)
        #     place_element(jdata)
        #     make_vasp_relax(jdata)
        #     make_scale(jdata)
        #     pert_scaled(jdata)
        #     poscar_elong('POSCAR', 'POSCAR.out', 3)
        #     pert_scaled(jdata)
        elif stage == 2:
            make_scale(jdata)
            pert_scaled(jdata)
        else:
            raise RuntimeError("unknown stage %d" % stage)
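# A minimal usage sketch for gen_init_surf, assuming this module is importable;
# the argparse.Namespace stands in for parsed CLI arguments, and the file name
# 'surf_param.json' is a hypothetical placeholder.
def _example_gen_init_surf():
    import argparse
    # MACHINE=None skips machine selection and the remote relaxation step
    args = argparse.Namespace(PARAM='surf_param.json', MACHINE=None)
    gen_init_surf(args)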
def get_machine_info(mdata, task_type):
    if task_type == "vasp":
        vasp_exec = mdata['fp_command']
        group_size = mdata['fp_group_size']
        resources = mdata['fp_resources']
        machine = mdata['fp_machine']
        machine_type = mdata['fp_machine']['machine_type']
        command = cmd_append_log(vasp_exec, "log")
    elif task_type in lammps_task_type:
        lmp_exec = mdata['lmp_command']
        group_size = mdata['model_devi_group_size']
        resources = mdata['model_devi_resources']
        machine = mdata['model_devi_machine']
        machine_type = mdata['model_devi_machine']['machine_type']
        command = cmd_append_log(lmp_exec + " -i lammps.in", "model_devi.log")
    else:
        # without this branch, an unknown task_type would surface as a
        # confusing NameError on `machine` below
        raise RuntimeError("unknown task type %s" % task_type)
    ssh_sess = SSHSession(machine)
    return machine, machine_type, ssh_sess, resources, command, group_size
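# A minimal usage sketch for get_machine_info, assuming mdata was loaded from
# a machine JSON file and already carries the 'fp_command', 'fp_group_size',
# 'fp_resources' and 'fp_machine' keys read above; the file name
# 'machine.json' is a hypothetical placeholder.
def _example_get_machine_info():
    with open('machine.json', 'r') as fp:
        mdata = json.load(fp)
    machine, machine_type, ssh_sess, resources, command, group_size = \
        get_machine_info(mdata, 'vasp')
    print(machine_type, group_size, command)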
def _main():
    parser = argparse.ArgumentParser(description="gen init confs")
    parser.add_argument('PARAM', type=str,
                        help="parameter file, json/yaml format")
    parser.add_argument('MACHINE', type=str,
                        help="The settings of the machine running the generator")
    args = parser.parse_args()

    try:
        import ruamel
        from monty.serialization import loadfn, dumpfn
        warnings.simplefilter('ignore', ruamel.yaml.error.MantissaNoDotYAML1_1Warning)
        jdata = loadfn(args.PARAM)
        mdata = loadfn(args.MACHINE)
    except Exception:
        with open(args.PARAM, 'r') as fp:
            jdata = json.load(fp)
        with open(args.MACHINE, 'r') as fp:
            mdata = json.load(fp)

    # Select a proper machine
    mdata = decide_fp_machine(mdata)
    fp_machine = mdata['fp_machine']
    fp_ssh_sess = SSHSession(fp_machine)

    # Decide the work path
    out_dir = out_dir_name(jdata)
    jdata['out_dir'] = out_dir
    print("# working dir %s" % out_dir)

    # Decide whether to use a given POSCAR
    from_poscar = False
    if 'from_poscar' in jdata:
        from_poscar = jdata['from_poscar']

    # Verify md_nstep: if an md_incar file is given, its NSW setting
    # overrides the value from PARAM
    md_nstep_jdata = jdata["md_nstep"]
    try:
        md_incar = jdata['md_incar']
        if os.path.isfile(md_incar):
            with open(md_incar, 'r') as fr:
                md_incar_lines = fr.readlines()
            nsw_flag = False
            for incar_line in md_incar_lines:
                line = incar_line.split()
                if "NSW" in line:
                    nsw_flag = True
                    nsw_steps = int(incar_line.split()[-1])
                    break
            if nsw_flag and nsw_steps != md_nstep_jdata:
                print("WARNING: your settings for MD steps in PARAM and md_incar are not consistent!")
                print("MD steps in PARAM is %d" % md_nstep_jdata)
                print("MD steps in md_incar is %d" % nsw_steps)
                print("DP-GEN will use the settings in md_incar!")
                jdata['md_nstep'] = nsw_steps
    except Exception:
        # md_incar is optional
        pass

    # Correct element names (capitalize the first letter)
    temp_elements = []
    for ele in jdata['elements']:
        temp_elements.append(ele[0].upper() + ele[1:])
    jdata['elements'] = temp_elements
    print("Elements are", jdata['elements'])

    # Iterate over the requested stages
    stage_list = [int(i) for i in jdata['stages']]
    for stage in stage_list:
        if stage == 1:
            print("Current stage is 1, relax")
            create_path(out_dir)
            shutil.copy2(args.PARAM, os.path.join(out_dir, 'param.json'))
            if from_poscar:
                make_super_cell_poscar(jdata)
            else:
                make_unit_cell(jdata)
                make_super_cell(jdata)
                place_element(jdata)
            make_vasp_relax(jdata, mdata)
            run_vasp_relax(jdata, mdata, fp_ssh_sess)
        elif stage == 2:
            print("Current stage is 2, perturb and scale")
            make_scale(jdata)
            pert_scaled(jdata)
        elif stage == 3:
            print("Current stage is 3, run a short md")
            make_vasp_md(jdata)
            run_vasp_md(jdata, mdata, fp_ssh_sess)
        elif stage == 4:
            print("Current stage is 4, collect data")
            coll_vasp_md(jdata)
        else:
            raise RuntimeError("unknown stage %d" % stage)
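# The entry point above is meant to be driven from the command line; a
# hypothetical invocation (script and file names are placeholders) would be
#
#   python gen.py param.json machine.json
#
# where param.json supplies 'stages', 'elements', 'md_nstep' and so on, and
# machine.json supplies the 'fp' machine settings consumed by decide_fp_machine.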
def run_task(json_file, machine_file):
    with open(json_file, 'r') as fp:
        jdata = json.load(fp)
    with open(machine_file, 'r') as fp:
        mdata = json.load(fp)
    record = "record.auto_test"

    model_devi_mdata = decide_model_devi_machine(mdata)
    model_devi_machine = model_devi_mdata['model_devi_machine']
    if ('machine_type' in model_devi_machine) and \
       (model_devi_machine['machine_type'] == 'ucloud'):
        model_devi_ssh_sess = None
    else:
        model_devi_ssh_sess = SSHSession(model_devi_machine)

    fp_mdata = decide_fp_machine(mdata)
    fp_machine = fp_mdata['fp_machine']
    if ('machine_type' in fp_machine) and \
       (fp_machine['machine_type'] == 'ucloud'):
        fp_ssh_sess = None
    else:
        fp_ssh_sess = SSHSession(fp_machine)

    confs = jdata['conf_dir']
    ele_list = [key for key in jdata['potcar_map'].keys()]
    key_id = jdata['key_id']

    ii = jdata['task_type']
    jj = jdata['task']
    task_list = ['equi', 'eos', 'elastic', 'vacancy',
                 'interstitial', 'surf', 'phonon', 'all']
    task_type_list = ['vasp'] + lammps_task_type
    if jj not in task_list:
        raise RuntimeError("unknown task %s, something wrong" % jj)
    if ii not in task_type_list:
        raise RuntimeError("unknown task type %s, something wrong" % ii)

    # generate configurations if conf_dir points under 'confs' and no POSCAR exists yet
    if 'confs' in confs and (not os.path.exists(confs + '/POSCAR')):
        print('generate %s' % (ele_list))
        if len(ele_list) == 1:
            gen_confs.gen_element(ele_list[0], key_id)
        else:
            gen_confs.gen_alloy(ele_list, key_id)

    # the default task: equilibrium
    log_iter("gen_equi", ii, "equi")
    gen_equi(ii, jdata, mdata)
    log_iter("run_equi", ii, "equi")
    run_equi(ii, jdata, mdata, model_devi_ssh_sess)
    log_iter("cmpt_equi", ii, "equi")
    cmpt_equi(ii, jdata, mdata)

    if jj == "eos" or jj == "all":
        log_iter("gen_eos", ii, "eos")
        gen_eos(ii, jdata, mdata)
        log_iter("run_eos", ii, "eos")
        run_eos(ii, jdata, mdata, model_devi_ssh_sess)
        log_iter("cmpt_eos", ii, "eos")
        cmpt_eos(ii, jdata, mdata)
    if jj == "elastic" or jj == "all":
        log_iter("gen_elastic", ii, "elastic")
        gen_elastic(ii, jdata, mdata)
        log_iter("run_elastic", ii, "elastic")
        run_elastic(ii, jdata, mdata, model_devi_ssh_sess)
        log_iter("cmpt_elastic", ii, "elastic")
        cmpt_elastic(ii, jdata, mdata)
    if jj == "vacancy" or jj == "all":
        log_iter("gen_vacancy", ii, "vacancy")
        gen_vacancy(ii, jdata, mdata)
        log_iter("run_vacancy", ii, "vacancy")
        run_vacancy(ii, jdata, mdata, model_devi_ssh_sess)
        log_iter("cmpt_vacancy", ii, "vacancy")
        cmpt_vacancy(ii, jdata, mdata)
    if jj == "interstitial" or jj == "all":
        log_iter("gen_interstitial", ii, "interstitial")
        gen_interstitial(ii, jdata, mdata)
        log_iter("run_interstitial", ii, "interstitial")
        run_interstitial(ii, jdata, mdata, model_devi_ssh_sess)
        log_iter("cmpt_interstitial", ii, "interstitial")
        cmpt_interstitial(ii, jdata, mdata)
    if jj == "surf" or jj == "all":
        log_iter("gen_surf", ii, "surf")
        gen_surf(ii, jdata, mdata)
        log_iter("run_surf", ii, "surf")
        run_surf(ii, jdata, mdata, model_devi_ssh_sess)
        log_iter("cmpt_surf", ii, "surf")
        cmpt_surf(ii, jdata, mdata)
    # if jj == "phonon":
    #     log_iter("gen_phonon", ii, "phonon")
    #     gen_phonon(ii, jdata, mdata)
    #     log_iter("run_phonon", ii, "phonon")
    #     run_phonon(ii, jdata, mdata, model_devi_ssh_sess)
    #     log_iter("cmpt_phonon", ii, "phonon")
    #     cmpt_phonon(ii, jdata, mdata)

    record_iter(record, confs, ii, jj)
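# A minimal usage sketch for run_task, assuming param.json sets 'task_type'
# (e.g. 'vasp'), 'task' (e.g. 'eos' or 'all'), 'conf_dir', 'potcar_map' and
# 'key_id'; both file names are hypothetical placeholders.
def _example_run_task():
    run_task('param.json', 'machine.json')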
def group_slurm_jobs(ssh_sess,
                     resources,
                     command,
                     work_path,
                     tasks,
                     group_size,
                     forward_common_files,
                     forward_task_files,
                     backward_task_files,
                     remote_job=SlurmJob,
                     forward_task_deference=True):
    task_chunks = [
        [os.path.basename(j) for j in tasks[i:i + group_size]]
        for i in range(0, len(tasks), group_size)
    ]
    cwd = os.getcwd()
    _pmap = PMap(cwd)
    path_map = _pmap.load()
    dlog.debug("work_path: %s" % work_path)
    dlog.debug("curr_path: %s" % cwd)

    job_list = []
    task_chunks_ = ['+'.join(ii) for ii in task_chunks]
    for ii in task_chunks_:
        dlog.debug("task_chunk %s" % ii)

    for ii, chunk in enumerate(task_chunks):
        # map chunk info to a unique id; a chunk that already appears in the
        # path map is resumed under its recorded job uuid
        chunk_uni = task_chunks_[ii].encode('utf-8')
        chunk_sha1 = sha1(chunk_uni).hexdigest()
        if chunk_sha1 in path_map:
            job_uuid = path_map[chunk_sha1][1].split('/')[-1]
            dlog.debug("load uuid %s" % job_uuid)
        else:
            job_uuid = None
        rjob = remote_job(ssh_sess, work_path, job_uuid)
        dlog.debug('uuid %s' % job_uuid)
        rjob.upload('.', forward_common_files)
        rjob.upload(chunk, forward_task_files,
                    dereference=forward_task_deference)
        if job_uuid:
            rjob.submit(chunk, command, resources=resources, restart=True)
        else:
            rjob.submit(chunk, command, resources=resources)
        job_list.append(rjob)
        path_map[chunk_sha1] = [rjob.local_root, rjob.remote_root]
    _pmap.dump(path_map)

    job_fin = [False for ii in job_list]
    lcount = [0] * len(job_list)
    count_fail = 0
    while not all(job_fin):
        for idx, rjob in enumerate(job_list):
            if not job_fin[idx]:
                try:
                    status = rjob.check_status()
                except Exception:
                    # the ssh session may have dropped; rebuild it and
                    # re-attach every job under its uuid
                    ssh_sess = SSHSession(ssh_sess.remote_profile)
                    for _idx, _rjob in enumerate(job_list):
                        job_list[_idx] = SlurmJob(ssh_sess, work_path, _rjob.job_uuid)
                    count_fail = count_fail + 1
                    dlog.info("ssh_sess failed for %d times" % count_fail)
                    break
                if status == JobStatus.terminated:
                    lcount[idx] += 1
                    _job_uuid = rjob.remote_root.split('/')[-1]
                    dlog.info('Job at %s terminated, submit again' % _job_uuid)
                    dlog.debug('try %s times for %s' % (lcount[idx], _job_uuid))
                    rjob.submit(task_chunks[idx], command,
                                resources=resources, restart=True)
                    if lcount[idx] > 3:
                        # give up after three resubmissions and fetch the error output
                        dlog.info('Too many errors for %s' % _job_uuid)
                        rjob.download(task_chunks[idx], backward_task_files,
                                      back_error=True)
                        rjob.clean()
                        job_fin[idx] = True
                elif status == JobStatus.finished:
                    rjob.download(task_chunks[idx], backward_task_files)
                    rjob.clean()
                    job_fin[idx] = True
        time.sleep(10)
        dlog.debug('error count')
        dlog.debug(lcount)
    # delete the path-map file when all jobs finish
    _pmap.delete()
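# A minimal usage sketch for group_slurm_jobs, assuming task directories
# task.000000, task.000001, ... were prepared under work_path; the machine
# profile, resource dict and file lists below are hypothetical placeholders.
def _example_group_slurm_jobs():
    import glob
    machine = {'hostname': 'login-node', 'port': 22,
               'username': 'user', 'work_path': '/scratch/user'}
    ssh_sess = SSHSession(machine)
    resources = {'numb_node': 1, 'task_per_node': 24}
    work_path = 'iter.000000/02.fp'
    tasks = glob.glob(os.path.join(work_path, 'task.*'))
    group_slurm_jobs(ssh_sess, resources, 'mpirun vasp_std > log', work_path,
                     tasks, group_size=4,
                     forward_common_files=['POTCAR'],
                     forward_task_files=['POSCAR', 'INCAR'],
                     backward_task_files=['OUTCAR', 'log'])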
def ucloud_submit_jobs(machine,
                       resources,
                       command,
                       work_path,
                       tasks,
                       group_size,
                       forward_common_files,
                       forward_task_files,
                       backward_task_files,
                       forward_task_deference=True):
    task_chunks = [
        [os.path.basename(j) for j in tasks[i:i + group_size]]
        for i in range(0, len(tasks), group_size)
    ]
    njob = len(task_chunks)

    # resume from a previous run if record.machine matches this job set
    continue_status = False
    if os.path.isfile("record.machine"):
        with open("record.machine", "r") as fr:
            record_machine = json.load(fr)
            if record_machine["purpose"] == machine["purpose"] and \
               record_machine["njob"] == njob:
                continue_status = True
                ucloud_machines = record_machine["ucloud_machines"]
                ucloud_hostids = record_machine["ucloud_hostids"]
    ucloud_url = machine['url']

    if continue_status == False:
        assert machine['machine_type'] == 'ucloud'
        ucloud_start_param = machine['ucloud_param']
        ucloud_start_param['Action'] = "CreateUHostInstance"
        ucloud_start_param['Name'] = "train"
        ucloud_start_param['Signature'] = _verfy_ac(machine['Private'],
                                                    ucloud_start_param)

        # start one cloud host per job chunk
        ucloud_machines = []
        ucloud_hostids = []
        for ii in range(njob):
            req = requests.get(ucloud_url, ucloud_start_param)
            if req.json()['RetCode'] != 0:
                print(json.dumps(req.json(), indent=2, sort_keys=True))
                raise RuntimeError("failed to start ucloud machine")
            ucloud_machines.append(str(req.json()["IPs"][0]))
            ucloud_hostids.append(str(req.json()["UHostIds"][0]))

        new_record_machine = {}
        new_record_machine["purpose"] = machine["purpose"]
        new_record_machine["njob"] = njob
        new_record_machine["ucloud_machines"] = ucloud_machines
        new_record_machine["ucloud_hostids"] = ucloud_hostids
        with open("record.machine", "w") as fw:
            json.dump(new_record_machine, fw)

    # wait until every host is reachable
    machine_fin = [False for ii in ucloud_machines]
    total_machine_num = len(ucloud_machines)
    fin_machine_num = 0
    while not all(machine_fin):
        for idx, mac in enumerate(ucloud_machines):
            if not machine_fin[idx]:
                ucloud_check_param = {}
                ucloud_check_param['Action'] = "GetUHostInstanceVncInfo"
                ucloud_check_param['Region'] = machine['ucloud_param']['Region']
                ucloud_check_param['UHostId'] = ucloud_hostids[idx]
                ucloud_check_param['PublicKey'] = machine['ucloud_param']['PublicKey']
                ucloud_check_param['Signature'] = _verfy_ac(machine['Private'],
                                                            ucloud_check_param)
                req = requests.get(ucloud_url, ucloud_check_param)
                print("the UHostId is", ucloud_hostids[idx])
                print(json.dumps(req.json(), indent=2, sort_keys=True))
                if req.json()['RetCode'] == 0:
                    machine_fin[idx] = True
                    fin_machine_num = fin_machine_num + 1
        print("Current finish", fin_machine_num, "/", total_machine_num)

        # double-check with DescribeUHostInstance that all hosts are running
        ucloud_check_param1 = {}
        ucloud_check_param1['Action'] = "DescribeUHostInstance"
        ucloud_check_param1['Region'] = machine['ucloud_param']['Region']
        ucloud_check_param1["Limit"] = 100
        ucloud_check_param1['PublicKey'] = machine['ucloud_param']['PublicKey']
        ucloud_check_param1['Signature'] = _verfy_ac(machine['Private'],
                                                     ucloud_check_param1)
        req1 = requests.get(ucloud_url, ucloud_check_param1).json()
        machine_all_fin = True
        for idx1 in range(int(req1["TotalCount"])):
            if req1["UHostSet"][idx1]["State"] != "Running":
                machine_all_fin = False
                break
        if machine_all_fin == True:
            machine_fin = [True for i in machine_fin]
        time.sleep(10)

    # open an ssh session to each host
    ssh_sess = []
    ssh_param = {}
    ssh_param['port'] = 22
    ssh_param['username'] = '******'
    ssh_param['work_path'] = machine['work_path']
    for ii in ucloud_machines:
        ssh_param['hostname'] = ii
        ssh_sess.append(SSHSession(ssh_param))

    # submit one chunk per host
    job_list = []
    for ii in range(njob):
        chunk = task_chunks[ii]
        print("Current machine is", ucloud_machines[ii])
        rjob = CloudMachineJob(ssh_sess[ii], work_path)
        rjob.upload('.', forward_common_files)
        rjob.upload(chunk, forward_task_files,
                    dereference=forward_task_deference)
        rjob.submit(chunk, command, resources=resources)
        job_list.append(rjob)

    # poll until every job finishes, removing each host as its job completes
    job_fin = [False for ii in job_list]
    while not all(job_fin):
        for idx, rjob in enumerate(job_list):
            if not job_fin[idx]:
                status = rjob.check_status()
                if status == JobStatus.terminated:
                    raise RuntimeError(
                        "find unsuccessfully terminated job on machine %s"
                        % ucloud_machines[idx])
                elif status == JobStatus.finished:
                    rjob.download(task_chunks[idx], backward_task_files)
                    rjob.clean()
                    _ucloud_remove_machine(machine, ucloud_hostids[idx])
                    job_fin[idx] = True
        time.sleep(10)
    os.remove("record.machine")
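# A minimal usage sketch for ucloud_submit_jobs, assuming a machine dict with
# the ucloud-specific keys read above ('url', 'Private', 'ucloud_param',
# 'purpose', 'work_path'); every value below is a hypothetical placeholder.
def _example_ucloud_submit_jobs():
    import glob
    machine = {'machine_type': 'ucloud',
               'url': 'https://api.ucloud.cn',
               'Private': '<private-key>',
               'purpose': 'fp',
               'work_path': '/home/user/work',
               'ucloud_param': {'Region': 'cn-bj2', 'PublicKey': '<public-key>'}}
    tasks = glob.glob('iter.000000/02.fp/task.*')
    ucloud_submit_jobs(machine, resources={}, command='mpirun vasp_std > log',
                       work_path='iter.000000/02.fp', tasks=tasks, group_size=1,
                       forward_common_files=['POTCAR'],
                       forward_task_files=['POSCAR', 'INCAR'],
                       backward_task_files=['OUTCAR', 'log'])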
def decide_model_devi_machine(mdata):
    if 'model_devi' in mdata:
        continue_flag = False
        ## decide whether to reuse a previously recorded machine
        if 'record.machine' in os.listdir():
            try:
                with open('record.machine', 'r') as _infile:
                    profile = json.load(_infile)
                    if profile['purpose'] == 'model_devi':
                        mdata['model_devi_machine'] = profile['machine']
                        mdata['model_devi_resources'] = profile['resources']
                        mdata['lmp_command'] = profile['command']
                        mdata['model_devi_group_size'] = profile['group_size']
                        continue_flag = True
            except Exception:
                pass
        pd_count_list = []
        pd_flag = False
        if not continue_flag:
            # mdata['model_devi'] is a list of {'machine', 'resources', ...} entries
            for machine_idx in range(len(mdata['model_devi'])):
                temp_machine = mdata['model_devi'][machine_idx]['machine']
                temp_resources = mdata['model_devi'][machine_idx]['resources']
                assert temp_machine['machine_type'] == 'slurm', \
                    "Currently only support for Slurm!"
                temp_ssh_sess = SSHSession(temp_machine)
                cwd = os.getcwd()
                temp_rjob = SlurmJob(temp_ssh_sess, cwd)
                ## count pending jobs by `squeue -u user -p partition | grep PD`
                command = temp_rjob._make_squeue(temp_machine, temp_resources)
                stdin, stdout, stderr = temp_rjob.ssh.exec_command(command)
                pd_response = stdout.read().decode('utf-8').split("\n")
                pd_count = len(pd_response)
                temp_rjob.clean()
                ## an empty squeue response splits to [''], i.e. length 1:
                ## nothing is pending on this machine, so use it right away
                if pd_count == 1:
                    mdata['model_devi_machine'] = temp_machine
                    mdata['model_devi_resources'] = temp_resources
                    mdata['lmp_command'] = mdata['model_devi'][machine_idx]['command']
                    mdata['model_devi_group_size'] = mdata['model_devi'][machine_idx]['group_size']
                    pd_flag = True
                    break
                else:
                    pd_count_list.append(pd_count)
            if not pd_flag:
                ## every machine has pending jobs; pick the one with the fewest
                min_machine_idx = np.argsort(pd_count_list)[0]
                mdata['model_devi_machine'] = mdata['model_devi'][min_machine_idx]['machine']
                mdata['model_devi_resources'] = mdata['model_devi'][min_machine_idx]['resources']
                mdata['lmp_command'] = mdata['model_devi'][min_machine_idx]['command']
                mdata['model_devi_group_size'] = mdata['model_devi'][min_machine_idx]['group_size']
            ## record which machine is selected
            with open("record.machine", "w") as _outfile:
                profile = {}
                profile['purpose'] = 'model_devi'
                profile['machine'] = mdata['model_devi_machine']
                profile['resources'] = mdata['model_devi_resources']
                profile['group_size'] = mdata['model_devi_group_size']
                profile['command'] = mdata['lmp_command']
                json.dump(profile, _outfile, indent=4)
    return mdata
def decide_train_machine(mdata):
    if 'train' in mdata:
        continue_flag = False
        ## decide whether to use an existing machine
        if 'record.machine' in os.listdir():
            try:
                with open('record.machine', 'r') as _infile:
                    profile = json.load(_infile)
                    if profile['purpose'] == 'train':
                        mdata['train_machine'] = profile['machine']
                        mdata['train_resources'] = profile['resources']
                        mdata['deepmd_path'] = profile['deepmd_path']
                        continue_flag = True
            except Exception:
                pass
        pd_flag = False
        pd_count_list = []
        # pd stands for a pending job in Slurm
        # if we need to launch new machines
        if not continue_flag:
            # mdata['train'] is a list of {'machine', 'resources', 'deepmd_path'} entries
            for machine_idx in range(len(mdata['train'])):
                temp_machine = mdata['train'][machine_idx]['machine']
                temp_resources = mdata['train'][machine_idx]['resources']
                assert temp_machine['machine_type'] == 'slurm', \
                    "Currently only support for Slurm!"
                temp_ssh_sess = SSHSession(temp_machine)
                cwd = os.getcwd()
                temp_rjob = SlurmJob(temp_ssh_sess, cwd)
                ## count pending jobs by `squeue -u user -p partition | grep PD`
                command = temp_rjob._make_squeue(temp_machine, temp_resources)
                stdin, stdout, stderr = temp_rjob.ssh.exec_command(command)
                pd_response = stdout.read().decode('utf-8').split("\n")
                pd_count = len(pd_response)
                temp_rjob.clean()
                ## if there is no need to wait for allocation
                ## (an empty squeue response splits to [''], i.e. length 1)
                if pd_count == 1:
                    mdata['train_machine'] = temp_machine
                    mdata['train_resources'] = temp_resources
                    mdata['deepmd_path'] = mdata['train'][machine_idx]['deepmd_path']
                    pd_flag = True
                    break
                else:
                    pd_count_list.append(pd_count)
            if not pd_flag:
                ## all machines need waiting, so compare the queues and
                ## select the machine with the fewest waiting jobs
                min_machine_idx = np.argsort(pd_count_list)[0]
                mdata['train_machine'] = mdata['train'][min_machine_idx]['machine']
                mdata['train_resources'] = mdata['train'][min_machine_idx]['resources']
                mdata['deepmd_path'] = mdata['train'][min_machine_idx]['deepmd_path']
            ## record which machine is selected
            with open("record.machine", "w") as _outfile:
                profile = {}
                profile['purpose'] = 'train'
                profile['machine'] = mdata['train_machine']
                profile['resources'] = mdata['train_resources']
                profile['deepmd_path'] = mdata['deepmd_path']
                json.dump(profile, _outfile, indent=4)
    return mdata
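# A minimal usage sketch for the decide_* helpers above, assuming mdata was
# loaded from a machine JSON file whose 'train' (and analogously 'model_devi')
# entry is a list of {'machine': ..., 'resources': ...} dicts; the file name
# 'machine.json' is a hypothetical placeholder.
def _example_decide_train_machine():
    with open('machine.json', 'r') as fp:
        mdata = json.load(fp)
    # decide_model_devi_machine works the same way on the 'model_devi' entry
    mdata = decide_train_machine(mdata)
    print(mdata['train_machine'], mdata['deepmd_path'])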