def testEntry(self):
    """Rebuild entries from the test directories and compare them with the references.

    One ``Entry`` is created per directory in ``self.iter_path`` from the
    VASP inputs, the OUTCAR and ``job.json`` found there. The number of
    entries and the inputs / id of the first entry are then checked against
    ``self.ref_entries``.
    """
    built = []
    for index, folder in enumerate(self.iter_path):
        vasp_input = VaspInput.from_directory(folder)
        labeled = LabeledSystem(os.path.join(folder, 'OUTCAR'))
        job_attrib = loadfn(os.path.join(folder, 'job.json'))
        composition = vasp_input['POSCAR'].structure.composition
        built.append(
            Entry(composition,
                  'vasp',
                  vasp_input.as_dict(),
                  labeled.as_dict(),
                  entry_id='pku-' + str(index),
                  attribute=job_attrib))
    self.assertEqual(len(built), len(self.ref_entries))

    # Only the first entry is compared field by field.
    first = built[0]
    reference = self.ref_entries[0]
    self.assertEqual(Incar.from_dict(first.inputs['INCAR']),
                     Incar.from_dict(reference.inputs['INCAR']))
    self.assertEqual(str(reference.inputs['KPOINTS']),
                     str(Kpoints.from_dict(first.inputs['KPOINTS'])))
    self.assertEqual(first.inputs['POTCAR'],
                     reference.inputs['POTCAR'].as_dict())
    self.assertEqual(Poscar.from_dict(first.inputs['POSCAR']).structure,
                     reference.inputs['POSCAR'].structure)
    self.assertEqual(first.entry_id, 'pku-0')
def compute(self, output_dir):
    """Return the labeled-system dict of ``output_dir/OUTCAR`` with stress added.

    Every OUTCAR line containing ``in kB`` contributes one symmetric 3x3
    matrix built from the six numbers that follow the two leading tokens
    (read in the order xx, yy, zz, xy, yz, zx).

    Returns ``None`` (after logging a warning) when the OUTCAR is missing.
    """
    outcar = os.path.join(output_dir, 'OUTCAR')
    if not os.path.isfile(outcar):
        dlog.warning("cannot find OUTCAR in " + output_dir + " skip")
        return None

    ls = LabeledSystem(outcar)
    stress = []
    with open(outcar, 'r') as fin:
        for row in fin:
            if 'in kB' not in row:
                continue
            tokens = row.split()
            xx = float(tokens[2])
            yy = float(tokens[3])
            zz = float(tokens[4])
            xy = float(tokens[5])
            yz = float(tokens[6])
            zx = float(tokens[7])
            stress.append([
                [xx, xy, zx],
                [xy, yy, yz],
                [zx, yz, zz],
            ])

    outcar_dict = ls.as_dict()
    # Stored in the serialized-numpy-array layout used by the rest of the dict.
    outcar_dict['data']['stress'] = {
        "@module": "numpy",
        "@class": "array",
        "dtype": "float64",
        "data": stress
    }
    return outcar_dict
def build_deepmd(path,nsw):
    """Convert ``path/outcar`` into deepmd npy data under ``path/deepmd``.

    The set size is a quarter of ``nsw`` (integer division); runs with at
    most 4 steps are reported and written with a set size of 1.
    """
    source = os.path.join(path, 'outcar')
    ls = LabeledSystem(source, fmt='outcar')
    deepmd = os.path.join(path, 'deepmd')
    if nsw <= 4:
        # Not expected in practice: nsw is normally well above 100.
        set_size = 1
        print("{0} has only {1}".format(path,nsw))
    elif nsw > 4:
        # A quarter of the frames per set; a non-multiple of 4 leaves a
        # small remainder (e.g. nsw=82 gives a set size of 20, remainder 2).
        set_size = nsw//4
    ls.to_deepmd_npy(deepmd, set_size=set_size)
def build_deepmd(path,nsw):
    """Convert ``path/OUTCAR`` into deepmd npy data under ``path/deepmd``.

    Up to 2000 steps the set size is half of ``nsw``; above that it is
    capped at 1000. For runs longer than 3000 steps ``check_sets`` is
    called on the output directory afterwards.
    """
    source = os.path.join(path, 'OUTCAR')
    ls = LabeledSystem(source, fmt='outcar')
    deepmd = os.path.join(path, 'deepmd')
    if nsw <= 2000:
        # nsw is expected to be larger than 100
        set_size = nsw//2
    elif nsw > 2000:
        set_size = 1000
    ls.to_deepmd_npy(deepmd, set_size=set_size)
    if nsw > 3000:
        check_sets(deepmd)
def load(cls, filename, Cls=None):
    """Create an instance from a JSON file.

    Parameters
    ----------
    filename : str
        Path of the JSON file. It must provide the keys ``composition``,
        ``calculator``, ``inputs``, ``data``, ``attribute``, ``entry_id``
        and ``tag``.
    Cls : class, optional
        Decoder for ``inputs`` when the calculator is not VASP; it must
        offer ``from_dict(...).as_dict()``.

    Returns
    -------
    An instance built as
    ``cls(composition, calculator, inputs, data, entry_id, attribute, tag)``.

    Raises
    ------
    RuntimeError
        If the calculator is not VASP and no ``Cls`` decoder is given.
    """
    with open(filename, 'r') as f:
        jc = json.load(f)

    composition = jc['composition']
    calculator = jc['calculator']
    if calculator.lower() == 'vasp':
        try:
            inputs = VaspInput.from_dict(jc['inputs']).as_dict()
        except Exception:
            # Best effort: keep the raw dict when the inputs cannot be
            # decoded. Only ordinary errors are handled here, so Ctrl-C and
            # interpreter shutdown are no longer swallowed.
            inputs = jc['inputs']
            warnings.warn("""Inproperly configure of POTCAR !
                 Returned instance cannot be used as input for from_dict() method """)
    else:
        if Cls:
            inputs = Cls.from_dict(jc['inputs']).as_dict()
        else:
            raise RuntimeError("inputs decoder must be given")

    data = LabeledSystem.from_dict(jc['data']).as_dict()
    attribute = jc['attribute']
    entry_id = jc['entry_id']
    tag = jc['tag']
    return cls(composition, calculator, inputs, data, entry_id, attribute,
               tag)
def to_system(self, data, **kwargs):
    """Turn a dpdata ``data`` dict into a list of single-frame systems.

    A ``LabeledSystem`` is used when ``data`` contains ``'forces'``,
    otherwise a plain ``System``. An empty system gives ``[]``, a
    one-frame system is returned as is inside a list, and longer systems
    are split frame by frame with ``sub_system``.
    """
    from dpdata import System, LabeledSystem

    system_cls = LabeledSystem if 'forces' in data else System
    system = system_cls(data=data)

    n_frames = len(system)
    if n_frames == 0:
        return []
    if n_frames == 1:
        return [system]
    return [system.sub_system([frame]) for frame in range(n_frames)]
def test():
    """Round-trip an ``Entry`` through JSON using the files in the current directory.

    Builds an entry from the local VASP inputs and ``OUTCAR``, dumps it to
    ``pku-1.json``, loads it back with ``Entry.load`` and prints both.
    """
    from monty.serialization import dumpfn, loadfn
    from monty.json import MontyDecoder, MontyEncoder
    from pymatgen.io.vasp.inputs import PotcarSingle, Potcar

    vasp_input = VaspInput.from_directory('.')
    labeled = LabeledSystem('OUTCAR', fmt='vasp/outcar')
    original = Entry('Al',
                     'vasp',
                     inputs=vasp_input.as_dict(),
                     data=labeled.as_dict(),
                     entry_id='pku-1')
    print(original)

    fname = 'pku-1.json'
    dumpfn(original.as_dict(), fname, indent=4)

    reloaded = Entry.load(fname)
    # vin = VaspInput.from_dict(reloaded.inputs)
    # vin.write_input('./new')
    print(reloaded)
    print(reloaded.as_dict())
def _parsing_vasp(paths, id_prefix, iters=True): entries = [] icount = 0 for path in paths: f_outcar = os.path.join(path, 'OUTCAR') f_job = os.path.join(path, 'job.json') try: vi = VaspInput.from_directory(path) if os.path.isfile(f_job): attrib = loadfn(f_job) else: attrib = {} if iters and attrib: tmp_ = path.split('/')[-1] iter_info = tmp_.split('.')[1] task_info = tmp_.split('.')[-1] attrib['iter_info'] = iter_info attrib['task_info'] = task_info else: pass comp = vi['POSCAR'].structure.composition ls = LabeledSystem(f_outcar) lss = ls.to_list() for ls in lss: if id_prefix: eid = id_prefix + "_" + str(icount) else: eid = str(uuid4()) entry = Entry(comp, 'vasp', vi.as_dict(), ls.as_dict(), attribute=attrib, entry_id=eid) entries.append(entry) icount += 1 except: dlog.info("failed here : %s" % path) return entries
def extract_outcar(outcar):
    """Extract energies, virials, forces and step indices from an OUTCAR.

    The second line of the file is expected to carry an
    ``nsw_sel = i j k ...`` header (as written by merge_out.py) listing
    the 1-based steps that were recalculated.

    Returns
    -------
    tuple
        ``(etot, stress, forces, nsw)`` where the first three come from the
        ``energies``, ``virials`` and ``forces`` of the labeled system and
        ``nsw`` is the 0-based integer array of the selected steps.
    """
    ls = LabeledSystem(outcar, fmt='outcar')

    # Read only the two header lines and close the file right away.
    with open(outcar) as fp:
        fp.readline()
        nsw_sel = fp.readline()

    if 'nsw_sel' in nsw_sel:
        print('file generated by merge_out.py')
        tmp = nsw_sel.split('=')[1].strip().split(' ')
        nsw_sel = [int(tmp_idx) for tmp_idx in tmp]
    # NOTE(review): without the header, nsw_sel is still the raw text line
    # and the integer conversion below will generally fail - confirm whether
    # plain VASP OUTCARs are meant to be supported here.

    etot = ls['energies']
    nsw = np.array(nsw_sel).astype(int) - 1  # relative nsw, starting from 0
    stress = ls['virials']
    forces = ls['forces']
    return etot, stress, forces, nsw
def build_deepmd(path, nsw, outcar, deepmd):
    """Write the selected OUTCAR frames as deepmd npy data.

    The training frames are chosen by the module-level ``args``:

    * ``args.idx``: text file of 0-based frame indices;
    * ``args.vaspidx``: text file of 1-based VASP step numbers. When the
      second line of the OUTCAR carries an ``nsw_sel = ...`` header (file
      produced by merge_out.py) the steps are mapped through that list,
      otherwise they are shifted by one. If both options are given the
      VASP indices take precedence.

    All remaining frames form the test selection, which is copied to
    ``set.001`` of the output when ``args.test`` is set.

    Parameters
    ----------
    path : str
        Directory under which the output directory is created.
    nsw : int
        Number of steps, used to choose the set size when
        ``args.batchsize`` is not given.
    outcar : str
        Path of the OUTCAR file.
    deepmd : str
        Name of the output directory, relative to ``path``.
    """
    ls = LabeledSystem(outcar, fmt='outcar')

    if args.idx:
        print("index file provided")
        idx = np.loadtxt(args.idx).astype(int)
    if args.vaspidx:
        print("vasp index file provided")
        vaspidx = np.loadtxt(args.vaspidx)
        # Only the two header lines are needed; close the file right away.
        with open(outcar) as fp:
            fp.readline()
            nsw_sel = fp.readline()
        if 'nsw_sel' in nsw_sel:
            print('file generated by merge_out.py')
            tmp = nsw_sel.split('=')[1].strip().split(' ')
            nsw_sel = [int(tmp_idx) for tmp_idx in tmp]
            idx = [i for i, step in enumerate(nsw_sel) if step in vaspidx]
        else:
            print('OUTCAR file generated by VASP')
            # np.loadtxt yields floats; frame indices must be integers.
            idx = vaspidx.astype(int) - 1
    # NOTE(review): idx stays unbound when neither args.idx nor
    # args.vaspidx is given - confirm callers always pass one of them.

    selected = set(np.asarray(idx).ravel().tolist())
    idx2 = [i for i in range(len(ls)) if i not in selected]
    ls2 = ls.sub_system(idx2)
    ls = ls.sub_system(idx)

    deepmd = os.path.join(path, deepmd)
    if args.batchsize:
        set_size = args.batchsize
    else:
        if nsw <= 4:
            # Not expected in practice: nsw is normally well above 100.
            set_size = 1
            print("{0} has only {1}".format(path, nsw))
        elif nsw > 4:
            # best_size picks the set size; a plain quarter split can leave
            # a very small last set.
            set_size, _ = best_size(nsw)
    ls.to_deepmd_npy(deepmd, set_size=set_size)

    if args.test:
        # Dump the test frames as one set in a scratch directory, then move
        # that set next to the training data.
        ls2.to_deepmd_npy('test_tmp', set_size=100000)
        shutil.copytree('test_tmp/set.000', os.path.join(deepmd, 'set.001'))
        shutil.rmtree('test_tmp')
from glob import glob
from tqdm import tqdm

# Process multiple systems: collect the OUTCARs of the selected iterations,
# drop tasks flagged by check_cluster (printed and skipped), drop files that
# cannot be parsed, and keep only systems whose first frame has a maximum
# force component not above maxf.
fs = glob('iter.0000[4-7]*/02.fp/task*/OUTCAR')
maxf = 3.0
ms = MultiSystems()
ic = 0
vacuum_size = 13
for f in tqdm(fs):
    if check_cluster(f.replace('OUTCAR', 'POSCAR'), vacuum_size,
                     fmt='POSCAR'):
        print(f)
        continue
    try:
        ls = LabeledSystem(f)
    except Exception:
        # Unreadable OUTCAR: report it and move on. Ctrl-C is no longer
        # treated as "skip this file".
        print(f)
        continue
    if len(ls) > 0:
        if ls.sub_system([0]).data['forces'].max() > maxf:
            pass
        else:
            ic += 1
            ms.append(ls)
print(len(fs))
print(ic)
ms.to_deepmd_raw('deepmd-f%s' % maxf)
ms.to_deepmd_npy('deepmd-f%s' % maxf)
def build_deepmd_frames(path, outcar, deepmd):
    """Filter the frames of ``outcar`` and write train/test deepmd npy data.

    Driven by the module-level ``args``:

    * ``args.format``: dpdata format of the input file;
    * ``args.exclude``: frame indices to drop first;
    * ``args.force_limit``: keep only frames whose force components all lie
      between the smallest and largest given value;
    * ``args.idx`` / ``args.vaspidx`` / ``args.train_test_ratio``: how the
      training frames are chosen (explicit 0-based indices, VASP step
      numbers, or a random split by ratio, in that order of precedence);
    * ``args.savetest``: also store the remaining frames as ``set.001``.

    Parameters
    ----------
    path : str
        Directory under which the output directory is created.
    outcar : str
        Path of the input file.
    deepmd : str
        Name of the output directory, relative to ``path``.
    """
    try:
        ls = LabeledSystem(outcar, fmt=args.format)
    except Exception:
        # Fall back to an unlabeled system when labels cannot be read.
        ls = System(outcar, fmt=args.format)

    if args.exclude:
        oldsize = len(ls)
        idx_new = [i for i in range(len(ls)) if i not in args.exclude]
        ls = ls.sub_system(idx_new)
        newsize = len(ls)
        print('{0}/{1} is selected'.format(newsize, oldsize))

    if args.force_limit:
        fmin = min(args.force_limit)
        fmax = max(args.force_limit)
        print("force limit imposed, force in between {0}, {1}".format(
            fmin, fmax))
        idx_new = []
        exclude = []
        for i in range(len(ls)):
            forces = ls[i].data['forces']
            if forces.min() >= fmin and forces.max() <= fmax:
                idx_new.append(i)
            else:
                exclude.append(i)
        print('excluded frames', exclude)
        print('{0} / {1} is selected'.format(len(idx_new), len(ls)))
        ls = ls.sub_system(idx_new)

    if args.idx:
        print("index file provided")
        idx = np.loadtxt(args.idx).astype(int)
    elif (not args.idx) and args.vaspidx:
        print("vasp index file provided")
        vaspidx = np.loadtxt(args.vaspidx)
        # Only the two header lines are needed; close the file right away.
        with open(outcar) as fp:
            fp.readline()
            nsw_sel = fp.readline()
        if 'nsw_sel' in nsw_sel:
            print('file generated by merge_out.py')
            tmp = nsw_sel.split('=')[1].strip().split(' ')
            nsw_sel = [int(tmp_idx) for tmp_idx in tmp]
            idx = [i for i, step in enumerate(nsw_sel) if step in vaspidx]
        else:
            print('OUTCAR file generated by VASP')
            # np.loadtxt yields floats; frame indices must be integers.
            idx = vaspidx.astype(int) - 1
    else:
        print("split train and test by ratio {0} : {1}".format(
            args.train_test_ratio, 1))
        train_size = round(
            len(ls) * (args.train_test_ratio) / (args.train_test_ratio + 1))
        idx = np.random.choice(range(len(ls)), train_size, replace=False)
        idx.sort()

    selected = set(np.asarray(idx).ravel().tolist())
    idx2 = [i for i in range(len(ls)) if i not in selected]  # test
    ls2 = ls.sub_system(idx2)  # test
    ls = ls.sub_system(idx)

    deepmd = os.path.join(path, deepmd)
    # A very large set size keeps all frames in a single set.
    ls.to_deepmd_npy(deepmd, set_size=1000000)

    if len(ls2) == 0:
        print('test set has no data')
    elif args.savetest and len(ls2) > 0:
        # Dump the test frames in a scratch directory, then move that set
        # next to the training data.
        ls2.to_deepmd_npy('test_tmp', set_size=1000000)
        shutil.copytree('test_tmp/set.000', os.path.join(deepmd, 'set.001'))
        shutil.rmtree('test_tmp')
# Scratch notes from an interactive session.
# NOTE(review): force1 is not defined in the visible part of this file.

# Per-row minima, the overall extrema, and the rows where they occur.
print(force1.min(axis=1))
print(min(force1.min(axis=1)), max(force1.max(axis=1)))
print(np.argmin(force1.min(axis=1)), np.argmax(force1.max(axis=1)))
#force=np.load('/Users/jiedeng/GD/papers/pv3_crystallization/post_nn/generate_new_poscar/pairs/untitled folder/sisi/deepmd/set.000/force.npy')
#max_f = []
#for f in force:
#    max_f.append(np.max(np.max(f)))
#print(max_f)
#plt.plot(max_f)

# Two deepmd data directories; only tmp2 is loaded below.
tmp = '/Users/jiedeng/GD/papers/pv3_crystallization/post_nn/exsolution_pert/u.project.ESS.lstixrud.jd848.pv+hf.dp-train.lmp_run.6k.rp5.160-cpu.pert.10k_good_p3.recal/deepmd_all'
tmp2 = '/Users/jiedeng/GD/papers/pv3_crystallization/post_nn/exsolution_pert/u.home.j.jd848.project-lstixrud.metad.3rd.recal/deepmd/'
ls = LabeledSystem(tmp2,fmt='deepmd/npy')

from dpdata import LabeledSystem
import numpy as np

# Drop frame index 14 from deepmd_ttr2 and write the remaining frames to 'test'.
ls = LabeledSystem('deepmd_ttr2',fmt='deepmd/npy')
idx = list(range(len(ls)))
idx.remove(14)
ls2 = ls.sub_system(idx)
ls2.to_deepmd_npy('test')
# Copy the matching rows of fparam (read from set.001) into the new set.000.
fparam = np.load('deepmd_ttr2/set.001/fparam.npy')
np.save('test/set.000/fparam.npy',fparam[idx])
# NOTE(review): the next two lines are shell commands, not Python -
# presumably pasted from an interactive (IPython) session; confirm.
cp -r deepmd_ttr2 deepmd_ttr2_tmp
rm -r deepmd_ttr2_tmp/set.000
from dpdata import LabeledSystem
import sys

# sys.path entries must be directories (or zip archives); pointing at the
# Data.py file itself would not make ``from Data import ...`` importable,
# so the directory that contains Data.py is added instead.
sys.path.append(
    '/Users/jiedeng/Documents/ml/deepmd-kit/my_example/codes')
from Data import DataSets
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import dpdata

# Load every OUTCAR found under the run directory as a multi-system.
vasp_multi_systems = dpdata.MultiSystems.from_dir(
    dir_name=
    '/Users/jiedeng/Documents/tmp/jd848/project_folder/pv+hf/3k/solid1/r3-3k/',
    file_name='OUTCAR',
    fmt='vasp/outcar')

from dpdata import LabeledSystem

ls = LabeledSystem(
    '/Users/jiedeng/Documents/tmp/jd848/project_folder/pv+hf/3k/solid1/r3-3k/OUTCAR',
    fmt='outcar')
print(ls.data['coords'].shape)  # (5000, 160, 3)
scaler = StandardScaler()

# We have n samples and want the most important / representative
# coordinates, so that they can stand in for the remaining features:
#   1) z-score the variables
#   2) eigendecomposition of the covariance matrix (it should be n*n,
#      not 480*480)
#   3) sort the eigenvalues
#   4) project the original normalized data onto the PCA space
# With the protocol above the input should be dat; when the PCA module is
# used directly the input should be dat.T.

### benchmark
from dpdata import LabeledSystem,MultiSystems
from glob import glob

# Process multi systems: convert the OUTCAR in the current directory to
# deepmd raw and npy data.
# The snippet used ``f`` and ``ms`` without defining them; both are set up
# here, following the pattern of the multi-file variants of this script.
f = 'OUTCAR'
ms = MultiSystems()
try:
    ls = LabeledSystem(f)
except Exception:
    # Unreadable OUTCAR: report it and write whatever was collected.
    print(f)
else:
    if len(ls) > 0:
        ms.append(ls)
ms.to_deepmd_raw('deepmd')
ms.to_deepmd_npy('deepmd')
from dpdata import System, LabeledSystem, MultiSystems
import os

# Read the list of folders to convert. Lines containing '#' and empty lines
# are ignored; the trailing newline is removed from the others.
with open('folders_to_merge', 'r') as fp:
    folders_org = fp.readlines()
folders = [
    entry.replace('\n', '') for entry in folders_org
    if not ('#' in entry or entry == '\n')
]

# Convert the OUTCAR of every folder in place, returning to the starting
# directory after each one.
for path in folders:
    pwd = os.getcwd()
    os.chdir(path)
    print("process ", path)
    # s = System('POSCAR', fmt='poscar')
    ls = LabeledSystem('OUTCAR', fmt='outcar')
    ls.to_deepmd_raw('deepmd')
    ls.to_deepmd_npy('deepmd', set_size=1000)
    os.chdir(pwd)
    print("done ", path)
print("done")
def _parsing_vasp(paths, config_info_dict, id_prefix, iters=True): entries = [] icount = 0 if iters: iter_record = [] iter_record_new = [] try: with open("record.database", "r") as f_record: iter_record = [i.split()[0] for i in f_record.readlines()] iter_record.sort() dlog.info("iter_record") dlog.info(iter_record) except: pass for path in paths: try: f_outcar = os.path.join(path, 'OUTCAR') f_job = os.path.join(path, 'job.json') tmp_iter = path.split('/')[-3] if (tmp_iter in iter_record) and (tmp_iter != iter_record[-1]): continue if tmp_iter not in iter_record_new: iter_record_new.append(tmp_iter) vi = VaspInput.from_directory(path) if os.path.isfile(f_job): attrib = loadfn(f_job) else: attrib = {} if iters and attrib: # generator/Cu/iter.000031/02.fp/task.007.000000 tmp_ = path.split('/')[-1] #config_info=tmp_.split('.')[1] task_info = tmp_.split('.')[-1] tmp_iter = path.split('/')[-3] iter_info = tmp_iter.split('.')[-1] sys_info = path.split('/')[-4] config_info_int = int(tmp_.split('.')[1]) for (key, value) in config_info_dict.items(): if config_info_int in value: config_info = key attrib['config_info'] = config_info attrib['task_info'] = task_info attrib['iter_info'] = iter_info attrib['sys_info'] = sys_info with open(f_outcar, "r") as fin_outcar: infile_outcar = fin_outcar.readlines() for line in infile_outcar: if "running on" in line: attrib["core"] = int(line.split()[2]) if "Elapse" in line: attrib["wall_time"] = float(line.split()[-1]) if "executed on" in line: attrib["date"] = line.split()[-2] attrib["clocktime"] = line.split()[-1] dlog.info("Attrib") dlog.info(attrib) comp = vi['POSCAR'].structure.composition ls = LabeledSystem(f_outcar) lss = ls.to_list() for ls in lss: if id_prefix: eid = id_prefix + "_" + str(icount) else: eid = str(uuid4()) entry = Entry(comp, 'vasp', vi.as_dict(), ls.as_dict(), attribute=attrib, entry_id=eid) entries.append(entry) icount += 1 except Exception: #dlog.info(str(Exception)) dlog.info("failed for %s" % (path)) #pass if 
iters: iter_record.sort() iter_record_new.sort() with open("record.database", "w") as fw: for line in iter_record: fw.write(line + "\n") for line in iter_record_new: fw.write(line + "\n") return entries
from dpdata import System
from dpdata import LabeledSystem
from dpdata import MultiSystems

# Convert the OUTCAR in the current directory to deepmd raw files, written
# in place.
# s = System('POSCAR', fmt='poscar')
# print(s)
ls = LabeledSystem('OUTCAR', fmt='outcar')

# Disabled: half of the frames (rounded up) as the npy set size.
#   if len(ls)%2==0:
#       size = int(len(ls)/2)
#   else:
#       size = int(len(ls)/2) + 1
ls.to_deepmd_raw('.')
# ls.to_deepmd_npy('deepmd', set_size=size)
from dpdata import LabeledSystem, MultiSystems
from glob import glob

# Process multi systems: gather every parsable, non-empty OUTCAR and write
# the collection as deepmd raw and npy data.
fs = glob('./*/[0-9]*/OUTCAR')
ms = MultiSystems()
for f in fs:
    try:
        ls = LabeledSystem(f)
    except Exception:
        # Report the unreadable file and skip it. Without the ``continue``
        # the previous system would be appended a second time (or ``ls``
        # would be undefined on the first file).
        print(f)
        continue
    if len(ls) > 0:
        ms.append(ls)
ms.to_deepmd_raw('deepmd')
ms.to_deepmd_npy('deepmd')
from dpdata import LabeledSystem, MultiSystems
from glob import glob
from tqdm import tqdm

# Process multi systems: keep a system only when the gap between the c
# lattice length and the z extent of the atoms in its first structure is
# at least 14, and the largest force component of its first frame does not
# exceed maxf.
fs = glob('iter.0000[3-5]*/02.fp/task*/OUTCAR')
maxf = 1.0
ms = MultiSystems()
ic = 0
for f in tqdm(fs):
    try:
        ls = LabeledSystem(f)
    except Exception:
        # Report the unreadable file and skip it. Without the ``continue``
        # the previous system would be evaluated again (or ``ls`` would be
        # undefined on the first file).
        print(f)
        continue
    if len(ls) > 0:
        st = ls.to_pymatgen_structure()[0]
        z = st.cart_coords[:, 2]
        if st.lattice.c - (z.max() - z.min()) < 14:
            pass
        else:
            if ls.sub_system([0]).data['forces'].max() > maxf:
                pass
            else:
                ic += 1
                ms.append(ls)
print(len(fs))
print(ic)
ms.to_deepmd_raw('deepmd-f%s' % maxf)
ms.to_deepmd_npy('deepmd-f%s' % maxf)
from glob import glob
from dpdata import LabeledSystem
from monty.serialization import dumpfn, loadfn
from tqdm import tqdm

# Collect the last frame of every OUTCAR as a pymatgen
# ComputedStructureEntry and dump all of them to one JSON file.
fs = glob('usefull-[1-3]/sys-*/OUTCAR')
entries = []
for f in tqdm(fs):
    ls = LabeledSystem(f)
    # The conversion used to be run twice per file with the first result
    # thrown away; once is enough.
    entry = ls.sub_system([-1]).to_pymatgen_ComputedStructureEntry()[0]
    entries.append(entry)
dumpfn(entries, 'all-vasp-entries.json')